[llvm] [AMDGPU][gfx1250] Support "cluster" syncscope (PR #157641)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 10 02:04:36 PDT 2025


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/157641

>From 57ba44ed4de6c45bb874e6596dd642086b21cf0f Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Sep 2025 12:40:28 +0200
Subject: [PATCH] [AMDGPU][gfx1250] Support "cluster" syncscope

Defaults to "agent" for targets that do not support it.

- Add documentation
- Register it in MachineModuleInfo
- Add MemoryLegalizer support
---
 llvm/docs/AMDGPUUsage.rst                     |    21 +
 .../Target/AMDGPU/AMDGPUMachineModuleInfo.cpp |     2 +
 .../Target/AMDGPU/AMDGPUMachineModuleInfo.h   |    20 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |     3 +
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |    51 +-
 .../CodeGen/AMDGPU/memory-legalizer-fence.ll  |   795 +
 .../AMDGPU/memory-legalizer-flat-cluster.ll   | 25726 ++++++++++++++++
 .../AMDGPU/memory-legalizer-global-cluster.ll | 23810 ++++++++++++++
 .../AMDGPU/memory-legalizer-local-cluster.ll  | 18467 +++++++++++
 .../memory-legalizer-private-cluster.ll       | 23347 ++++++++++++++
 10 files changed, 92224 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 2cddc3365d5d7..c7d515aeb012f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -537,6 +537,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
                                                                       - Packed
                                                                         work-item                       Add product
                                                                         IDs                             names.
+                                                                      - Workgroup
+                                                                        Clusters
 
      =========== =============== ============ ===== ================= =============== =============== ======================
 
@@ -1095,6 +1097,22 @@ is conservatively correct for OpenCL.
                              - ``wavefront`` and executed by a thread in the
                                same wavefront.
 
+     ``cluster``             Synchronizes with, and participates in modification
+                             and seq_cst total orderings with, other operations
+                             (except image operations) for all address spaces
+                             (except private, or generic that accesses private)
+                             provided the other operation's sync scope is:
+
+                             - ``system``, ``agent`` or ``cluster`` and
+                               executed by a thread on the same cluster.
+                             - ``workgroup`` and executed by a thread in the
+                               same work-group.
+                             - ``wavefront`` and executed by a thread in the
+                               same wavefront.
+
+                             On targets that do not support workgroup cluster
+                             launch mode, this behaves like ``agent`` scope instead.
+
      ``workgroup``           Synchronizes with, and participates in modification
                              and seq_cst total orderings with, other operations
                              (except image operations) for all address spaces
@@ -1128,6 +1146,9 @@ is conservatively correct for OpenCL.
      ``agent-one-as``        Same as ``agent`` but only synchronizes with other
                              operations within the same address space.
 
+     ``cluster-one-as``      Same as ``cluster`` but only synchronizes with other
+                             operations within the same address space.
+
      ``workgroup-one-as``    Same as ``workgroup`` but only synchronizes with
                              other operations within the same address space.
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
index eda479064d7b2..d09b7cffe9f29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
   AgentSSID = CTX.getOrInsertSyncScopeID("agent");
   WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
   WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
+  ClusterSSID = CTX.getOrInsertSyncScopeID("cluster");
   SystemOneAddressSpaceSSID =
       CTX.getOrInsertSyncScopeID("one-as");
   AgentOneAddressSpaceSSID =
@@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
       CTX.getOrInsertSyncScopeID("wavefront-one-as");
   SingleThreadOneAddressSpaceSSID =
       CTX.getOrInsertSyncScopeID("singlethread-one-as");
+  ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as");
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index fcb0c8cfb7ca6..bf852bb38376e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -32,6 +32,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
   SyncScope::ID WorkgroupSSID;
   /// Wavefront synchronization scope ID (cross address space).
   SyncScope::ID WavefrontSSID;
+  /// Cluster synchronization scope ID (cross address space).
+  SyncScope::ID ClusterSSID;
   /// System synchronization scope ID (single address space).
   SyncScope::ID SystemOneAddressSpaceSSID;
   /// Agent synchronization scope ID (single address space).
@@ -42,6 +44,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
   SyncScope::ID WavefrontOneAddressSpaceSSID;
   /// Single thread synchronization scope ID (single address space).
   SyncScope::ID SingleThreadOneAddressSpaceSSID;
+  /// Cluster synchronization scope ID (single address space).
+  SyncScope::ID ClusterOneAddressSpaceSSID;
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
@@ -60,12 +64,15 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
     else if (SSID == getWorkgroupSSID() ||
              SSID == getWorkgroupOneAddressSpaceSSID())
       return 2;
+    else if (SSID == getClusterSSID() ||
+             SSID == getClusterOneAddressSpaceSSID())
+      return 3;
     else if (SSID == getAgentSSID() ||
              SSID == getAgentOneAddressSpaceSSID())
-      return 3;
+      return 4;
     else if (SSID == SyncScope::System ||
              SSID == getSystemOneAddressSpaceSSID())
-      return 4;
+      return 5;
 
     return std::nullopt;
   }
@@ -73,7 +80,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
   /// \returns True if \p SSID is restricted to single address space, false
   /// otherwise
   bool isOneAddressSpace(SyncScope::ID SSID) const {
-    return SSID == getSingleThreadOneAddressSpaceSSID() ||
+    return SSID == getClusterOneAddressSpaceSSID() ||
+           SSID == getSingleThreadOneAddressSpaceSSID() ||
            SSID == getWavefrontOneAddressSpaceSSID() ||
            SSID == getWorkgroupOneAddressSpaceSSID() ||
            SSID == getAgentOneAddressSpaceSSID() ||
@@ -95,6 +103,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
   SyncScope::ID getWavefrontSSID() const {
     return WavefrontSSID;
   }
+  /// \returns Cluster synchronization scope ID (cross address space).
+  SyncScope::ID getClusterSSID() const { return ClusterSSID; }
   /// \returns System synchronization scope ID (single address space).
   SyncScope::ID getSystemOneAddressSpaceSSID() const {
     return SystemOneAddressSpaceSSID;
@@ -115,6 +125,10 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
   SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
     return SingleThreadOneAddressSpaceSSID;
   }
+  /// \returns Single thread synchronization scope ID (single address space).
+  SyncScope::ID getClusterOneAddressSpaceSSID() const {
+    return ClusterOneAddressSpaceSSID;
+  }
 
   /// In AMDGPU target synchronization scopes are inclusive, meaning a
   /// larger synchronization scope is inclusive of a smaller synchronization
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e172a9c699fb1..cbd6f64976d21 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1833,6 +1833,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return GFX1250Insts && getGeneration() == GFX12;
   }
 
+  /// \returns true if the subtarget supports clusters of workgroups.
+  bool hasClusters() const { return GFX1250Insts; }
+
   /// \returns true if the subtarget requires a wait for xcnt before atomic
   /// flat/global stores & rmw.
   bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0be6a9d09379f..1637c06936f9b 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -63,6 +63,7 @@ enum class SIAtomicScope {
   SINGLETHREAD,
   WAVEFRONT,
   WORKGROUP,
+  CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
   AGENT,
   SYSTEM
 };
@@ -106,6 +107,7 @@ class SIMemOpInfo final {
   bool IsCooperative = false;
 
   SIMemOpInfo(
+      const GCNSubtarget &ST,
       AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
       SIAtomicScope Scope = SIAtomicScope::SYSTEM,
       SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
@@ -156,6 +158,11 @@ class SIMemOpInfo final {
                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
     }
+
+    // On targets that have no concept of a workgroup cluster, use
+    // AGENT scope as a conservatively correct alternative.
+    if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
+      this->Scope = SIAtomicScope::AGENT;
   }
 
 public:
@@ -225,6 +232,7 @@ class SIMemOpInfo final {
 class SIMemOpAccess final {
 private:
   const AMDGPUMachineModuleInfo *MMI = nullptr;
+  const GCNSubtarget &ST;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
   void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -248,7 +256,7 @@ class SIMemOpAccess final {
 public:
   /// Construct class to support accessing the machine memory operands
   /// of instructions in the machine function \p MF.
-  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
+  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
 
   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
@@ -773,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
     return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
   if (SSID == MMI->getAgentSSID())
     return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
+  if (SSID == MMI->getClusterSSID())
+    return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
   if (SSID == MMI->getWorkgroupSSID())
     return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                       true);
@@ -788,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
   if (SSID == MMI->getAgentOneAddressSpaceSSID())
     return std::tuple(SIAtomicScope::AGENT,
                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
+  if (SSID == MMI->getClusterOneAddressSpaceSSID())
+    return std::tuple(SIAtomicScope::CLUSTER,
+                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
     return std::tuple(SIAtomicScope::WORKGROUP,
                       SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
@@ -815,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
   return SIAtomicAddrSpace::OTHER;
 }
 
-SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
-    : MMI(&MMI_) {}
+SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
+                             const GCNSubtarget &ST)
+    : MMI(&MMI_), ST(ST) {}
 
 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) const {
@@ -877,7 +891,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
       return std::nullopt;
     }
   }
-  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+  return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                      IsNonTemporal, IsLastUse, IsCooperative);
 }
@@ -891,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo();
+    return SIMemOpInfo(ST);
 
   return constructFromMIWithMMO(MI);
 }
@@ -905,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo();
+    return SIMemOpInfo(ST);
 
   return constructFromMIWithMMO(MI);
 }
@@ -946,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
   if (SynchronizeAS)
     OrderingAddrSpace = *SynchronizeAS;
 
-  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
-                     IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
+  return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
+                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
+                     AtomicOrdering::NotAtomic);
 }
 
 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -959,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo();
+    return SIMemOpInfo(ST);
 
   return constructFromMIWithMMO(MI);
 }
@@ -2377,6 +2392,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
+    case SIAtomicScope::CLUSTER:
       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
         LOADCnt |= true;
       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2413,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
     switch (Scope) {
     case SIAtomicScope::SYSTEM:
     case SIAtomicScope::AGENT:
+    case SIAtomicScope::CLUSTER:
     case SIAtomicScope::WORKGROUP:
       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
       // not needed as LDS operations for all waves are executed in a total
@@ -2495,6 +2512,9 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
   case SIAtomicScope::AGENT:
     ScopeImm = AMDGPU::CPol::SCOPE_DEV;
     break;
+  case SIAtomicScope::CLUSTER:
+    ScopeImm = AMDGPU::CPol::SCOPE_SE;
+    break;
   case SIAtomicScope::WORKGROUP:
     // GFX12.0:
     //  In WGP mode the waves of a work-group can be executing on either CU of
@@ -2565,6 +2585,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
           .addImm(AMDGPU::CPol::SCOPE_DEV);
     }
     break;
+  case SIAtomicScope::CLUSTER:
   case SIAtomicScope::WORKGROUP:
     // No WB necessary, but we still have to wait.
     break;
@@ -2649,11 +2670,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
   const unsigned Scope = CPol->getImm() & CPol::SCOPE;
 
   // GFX12.0 only: Extra waits needed before system scope stores.
-  if (!ST.hasGFX1250Insts()) {
-    if (!Atomic && Scope == CPol::SCOPE_SYS)
-      return insertWaitsBeforeSystemScopeStore(MI);
-    return Changed;
-  }
+  if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+    Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
 
   return Changed;
 }
@@ -2684,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
     case SIAtomicScope::AGENT:
       Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
       break;
+    case SIAtomicScope::CLUSTER:
+      Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
+      break;
     case SIAtomicScope::WORKGROUP:
       // In workgroup mode, SCOPE_SE is needed as waves can executes on
       // different CUs that access different L0s.
@@ -2930,8 +2951,8 @@ SIMemoryLegalizerPass::run(MachineFunction &MF,
 bool SIMemoryLegalizer::run(MachineFunction &MF) {
   bool Changed = false;
 
-  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
   CC = SICacheControl::create(ST);
 
   for (auto &MBB : MF) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 736a8b58466dd..d288bfc6a09db 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1632,6 +1632,801 @@ entry:
   ret void
 }
 
+
+define amdgpu_kernel void @cluster_acquire_fence() {
+; GFX6-LABEL: cluster_acquire_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_acquire_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_acquire_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_acquire_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_acquire_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_acquire_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_acquire_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_acquire_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_acquire_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_acquire_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_acquire_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_acquire_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster") acquire
+  ret void
+}
+
+define amdgpu_kernel void @cluster_release_fence() {
+; GFX6-LABEL: cluster_release_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_release_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_release_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_release_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_release_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_release_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_release_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_release_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_release_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_release_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_release_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_release_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster") release
+  ret void
+}
+
+define amdgpu_kernel void @cluster_acq_rel_fence() {
+; GFX6-LABEL: cluster_acq_rel_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_acq_rel_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_acq_rel_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_acq_rel_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_acq_rel_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_acq_rel_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_acq_rel_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_acq_rel_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_acq_rel_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_acq_rel_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_acq_rel_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_acq_rel_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @cluster_seq_cst_fence() {
+; GFX6-LABEL: cluster_seq_cst_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_seq_cst_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_seq_cst_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_seq_cst_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_seq_cst_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_seq_cst_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_seq_cst_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_seq_cst_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_seq_cst_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_seq_cst_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_seq_cst_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_seq_cst_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @cluster_one_as_acquire_fence() {
+; GFX6-LABEL: cluster_one_as_acquire_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_one_as_acquire_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_one_as_acquire_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_one_as_acquire_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_one_as_acquire_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_one_as_acquire_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_acquire_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_one_as_acquire_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_one_as_acquire_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_one_as_acquire_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_one_as_acquire_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_one_as_acquire_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_one_as_acquire_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster-one-as") acquire
+  ret void
+}
+
+define amdgpu_kernel void @cluster_one_as_release_fence() {
+; GFX6-LABEL: cluster_one_as_release_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_one_as_release_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_one_as_release_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_one_as_release_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_one_as_release_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_release_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_one_as_release_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_release_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_one_as_release_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_one_as_release_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_one_as_release_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_one_as_release_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_one_as_release_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_one_as_release_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster-one-as") release
+  ret void
+}
+
+define amdgpu_kernel void @cluster_one_as_acq_rel_fence() {
+; GFX6-LABEL: cluster_one_as_acq_rel_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_one_as_acq_rel_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_one_as_acq_rel_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_one_as_acq_rel_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_one_as_acq_rel_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_acq_rel_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_one_as_acq_rel_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_one_as_acq_rel_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_one_as_acq_rel_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_one_as_acq_rel_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_one_as_acq_rel_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_one_as_acq_rel_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster-one-as") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @cluster_one_as_seq_cst_fence() {
+; GFX6-LABEL: cluster_one_as_seq_cst_fence:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: cluster_one_as_seq_cst_fence:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: cluster_one_as_seq_cst_fence:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: cluster_one_as_seq_cst_fence:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: cluster_one_as_seq_cst_fence:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: cluster_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_seq_cst_fence:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: cluster_one_as_seq_cst_fence:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: cluster_one_as_seq_cst_fence:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: cluster_one_as_seq_cst_fence:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: cluster_one_as_seq_cst_fence:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: cluster_one_as_seq_cst_fence:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: cluster_one_as_seq_cst_fence:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+entry:
+  fence syncscope("cluster-one-as") seq_cst
+  ret void
+}
+
 define amdgpu_kernel void @agent_acquire_fence() {
 ; GFX6-LABEL: agent_acquire_fence:
 ; GFX6:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
new file mode 100644
index 0000000000000..9ea9f1125c726
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
@@ -0,0 +1,25726 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @flat_cluster_unordered_load(
+; GFX7-LABEL: flat_cluster_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster") unordered, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_load(
+; GFX7-LABEL: flat_cluster_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster") monotonic, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_load(
+; GFX7-LABEL: flat_cluster_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster") acquire, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_load(
+; GFX7-LABEL: flat_cluster_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster") seq_cst, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_unordered_store(
+; GFX7-LABEL: flat_cluster_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_store(
+; GFX7-LABEL: flat_cluster_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_store(
+; GFX7-LABEL: flat_cluster_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_store(
+; GFX7-LABEL: flat_cluster_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw(
+; GFX7-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_atomicrmw(
+; GFX7-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_atomicrmw(
+; GFX7-LABEL: flat_cluster_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") release
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw(
+; GFX7-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw(
+; GFX7-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw(
+; GFX7-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acquire
+  store i32 %val, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw(
+; GFX7-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acq_rel
+  store i32 %val, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw(
+; GFX7-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") seq_cst
+  store i32 %val, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_unordered_load(
+; GFX7-LABEL: flat_cluster_one_as_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1]
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster-one-as") unordered, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_load(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster-one-as") monotonic, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_load(
+; GFX7-LABEL: flat_cluster_one_as_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster-one-as") acquire, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_load_dword v2, v[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_load_b32 v2, v[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %in, ptr %out) {
+entry:
+  %val = load atomic i32, ptr %in syncscope("cluster-one-as") seq_cst, align 4
+  store i32 %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_unordered_store(
+; GFX7-LABEL: flat_cluster_one_as_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster-one-as") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_store(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster-one-as") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_store(
+; GFX7-LABEL: flat_cluster_one_as_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster-one-as") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2 sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr %out) {
+entry:
+  store atomic i32 %in, ptr %out syncscope("cluster-one-as") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") release
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acquire
+  store i32 %val, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acq_rel
+  store i32 %val, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") seq_cst
+  store i32 %val, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s4, s8
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s9
+; GFX10-WGP-NEXT:    s_mov_b32 s9, s10
+; GFX10-WGP-NEXT:    s_mov_b32 s8, s11
+; GFX10-WGP-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-WGP-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-WGP-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-WGP-NEXT:    s_mov_b32 s5, s8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[10:11], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s4, s8
+; GFX10-CU-NEXT:    s_mov_b32 s5, s9
+; GFX10-CU-NEXT:    s_mov_b32 s9, s10
+; GFX10-CU-NEXT:    s_mov_b32 s8, s11
+; GFX10-CU-NEXT:    s_add_u32 s4, s4, s9
+; GFX10-CU-NEXT:    s_addc_u32 s8, s5, s8
+; GFX10-CU-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX10-CU-NEXT:    s_mov_b32 s5, s8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s3, s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[6:7], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s7
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s0, s0, s5
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s4, s1, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s4
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX7-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-WGP-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-WGP-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_mov_b32 s6, s4
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s5
+; GFX10-WGP-NEXT:    s_mov_b32 s11, s12
+; GFX10-WGP-NEXT:    s_mov_b32 s10, s13
+; GFX10-WGP-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-WGP-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-WGP-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-WGP-NEXT:    s_mov_b32 s7, s10
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-CU-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
+; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
+; GFX10-CU-NEXT:    s_mov_b64 s[12:13], 16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_mov_b32 s6, s4
+; GFX10-CU-NEXT:    s_mov_b32 s7, s5
+; GFX10-CU-NEXT:    s_mov_b32 s11, s12
+; GFX10-CU-NEXT:    s_mov_b32 s10, s13
+; GFX10-CU-NEXT:    s_add_u32 s6, s6, s11
+; GFX10-CU-NEXT:    s_addc_u32 s10, s7, s10
+; GFX10-CU-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX10-CU-NEXT:    s_mov_b32 s7, s10
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[8:9], 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, s7
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s6, s3, s6
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, v0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-TGSPLIT-NEXT:    flat_store_dword v[0:1], v2
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr %out, i32 4
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
new file mode 100644
index 0000000000000..7c0a2ad5cdb78
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
@@ -0,0 +1,23810 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @global_cluster_unordered_load(
+; GFX6-LABEL: global_cluster_unordered_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") unordered, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_load(
+; GFX6-LABEL: global_cluster_monotonic_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") monotonic, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_load(
+; GFX6-LABEL: global_cluster_acquire_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") acquire, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_load(
+; GFX6-LABEL: global_cluster_seq_cst_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_unordered_store(
+; GFX6-LABEL: global_cluster_unordered_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_store(
+; GFX6-LABEL: global_cluster_monotonic_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_store(
+; GFX6-LABEL: global_cluster_release_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_store(
+; GFX6-LABEL: global_cluster_seq_cst_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_atomicrmw(
+; GFX6-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_atomicrmw(
+; GFX6-LABEL: global_cluster_acquire_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_atomicrmw(
+; GFX6-LABEL: global_cluster_release_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") release
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw(
+; GFX6-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw(
+; GFX6-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw(
+; GFX6-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acquire
+  store i32 %val, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acq_rel
+  store i32 %val, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") seq_cst
+  store i32 %val, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_unordered_load(
+; GFX6-LABEL: global_cluster_one_as_unordered_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") unordered, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_load(
+; GFX6-LABEL: global_cluster_one_as_monotonic_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") monotonic, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_load(
+; GFX6-LABEL: global_cluster_one_as_acquire_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") acquire, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_load(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, s9
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX6-NEXT:    s_mov_b32 s12, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s13, -1
+; GFX6-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; GFX6-NEXT:    s_mov_b32 s9, s6
+; GFX6-NEXT:    s_mov_b32 s10, s13
+; GFX6-NEXT:    s_mov_b32 s11, s12
+; GFX6-NEXT:    s_mov_b32 s14, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s14
+; GFX6-NEXT:    s_mov_b32 s6, s13
+; GFX6-NEXT:    s_mov_b32 s7, s12
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s5
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s9, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s10, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s10
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s9
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s8
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_load_dword v1, v0, s[2:3] sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_load_b32 v1, v0, s[2:3] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") seq_cst, align 4
+  store i32 %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_unordered_store(
+; GFX6-LABEL: global_cluster_one_as_unordered_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_store(
+; GFX6-LABEL: global_cluster_one_as_monotonic_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_store(
+; GFX6-LABEL: global_cluster_one_as_release_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_store(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(1) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") release
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acquire
+  store i32 %val, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acq_rel
+  store i32 %val, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[8:9], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s11
+; GFX6-NEXT:    s_mov_b32 s6, s10
+; GFX6-NEXT:    s_mov_b32 s7, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[0:3], 0 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[4:5] glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") seq_cst
+  store i32 %val, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
+; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x3
+; GFX7-NEXT:    s_mov_b64 s[10:11], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s4, s8
+; GFX7-NEXT:    s_mov_b32 s5, s9
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    s_mov_b32 s8, s11
+; GFX7-NEXT:    s_add_u32 s4, s4, s9
+; GFX7-NEXT:    s_addc_u32 s8, s5, s8
+; GFX7-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GFX7-NEXT:    s_mov_b32 s5, s8
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s12, s5
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GFX6-NEXT:    s_mov_b32 s10, 0x100f000
+; GFX6-NEXT:    s_mov_b32 s11, -1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX6-NEXT:    s_mov_b32 s5, s12
+; GFX6-NEXT:    s_mov_b32 s6, s11
+; GFX6-NEXT:    s_mov_b32 s7, s10
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    v_mov_b32_e32 v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT:    s_add_i32 s12, s12, s17
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
+; GFX7-NEXT:    s_mov_b64 s[12:13], 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, s4
+; GFX7-NEXT:    s_mov_b32 s7, s5
+; GFX7-NEXT:    s_mov_b32 s11, s12
+; GFX7-NEXT:    s_mov_b32 s10, s13
+; GFX7-NEXT:    s_add_u32 s6, s6, s11
+; GFX7-NEXT:    s_addc_u32 s10, s7, s10
+; GFX7-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GFX7-NEXT:    s_mov_b32 s7, s10
+; GFX7-NEXT:    v_mov_b32_e32 v2, s9
+; GFX7-NEXT:    v_mov_b32_e32 v0, s8
+; GFX7-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl1_inv
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s7
+; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s6
+; GFX10-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT:    global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_gl1_inv
+; GFX10-CU-NEXT:    buffer_gl0_inv
+; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s5, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x3
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0xf000
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, -1
+; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s8
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s7
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s6
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s5
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s4
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, v2
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0xc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NOTTGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-TGSPLIT-NEXT:    buffer_wbl2 sc1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc1
+; GFX942-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl1_inv
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    buffer_gl1_inv
+; GFX11-CU-NEXT:    buffer_gl0_inv
+; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-WGP-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-CU-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-CU-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    s_wait_storecnt 0x0
+; GFX12-CU-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(1) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(1) %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
new file mode 100644
index 0000000000000..8926893c68dbc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -0,0 +1,18467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @local_cluster_unordered_load(
+; GFX6-LABEL: local_cluster_unordered_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") unordered, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_load(
+; GFX6-LABEL: local_cluster_monotonic_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") monotonic, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_load(
+; GFX6-LABEL: local_cluster_acquire_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") acquire, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_load(
+; GFX6-LABEL: local_cluster_seq_cst_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") seq_cst, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_unordered_store(
+; GFX6-LABEL: local_cluster_unordered_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_store(
+; GFX6-LABEL: local_cluster_monotonic_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_store(
+; GFX6-LABEL: local_cluster_release_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_store(
+; GFX6-LABEL: local_cluster_seq_cst_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_atomicrmw(
+; GFX6-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_atomicrmw(
+; GFX6-LABEL: local_cluster_acquire_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_atomicrmw(
+; GFX6-LABEL: local_cluster_release_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") release
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
+; GFX6-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
+; GFX6-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw(
+; GFX6-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acquire
+  store i32 %val, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acq_rel
+  store i32 %val, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") seq_cst
+  store i32 %val, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    buffer_gl0_inv
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    buffer_inv sc0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    buffer_gl0_inv
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_storecnt 0x0
+; GFX12-WGP-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_unordered_load(
+; GFX6-LABEL: local_cluster_one_as_unordered_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") unordered, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_load(
+; GFX6-LABEL: local_cluster_one_as_monotonic_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") monotonic, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_load(
+; GFX6-LABEL: local_cluster_one_as_acquire_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") acquire, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_load(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr4
+; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr5
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    ds_read_b32 v1, v0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    ds_read_b32 v1, v0
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    ds_read_b32 v1, v0
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    ds_read_b32 v1, v0
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    ds_read_b32 v1, v0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    ds_load_b32 v1, v0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    ds_load_b32 v1, v0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    ds_load_b32 v1, v0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    ds_load_b32 v1, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") seq_cst, align 4
+  store i32 %val, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_unordered_store(
+; GFX6-LABEL: local_cluster_one_as_unordered_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_store(
+; GFX6-LABEL: local_cluster_one_as_monotonic_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_store(
+; GFX6-LABEL: local_cluster_one_as_release_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_store(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(3) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") release
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acquire
+  store i32 %val, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acq_rel
+  store i32 %val, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") seq_cst
+  store i32 %val, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr6
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-WGP-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-CU-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    ; kill: def $sgpr5 killed $sgpr4
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v0, v1
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v0, v1
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-WGP-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-CU-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    ds_write_b32 v0, v1
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s5
+; GFX90A-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NOTTGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-TGSPLIT-NEXT:    ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    ds_write_b32 v0, v1
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    ds_store_b32 v0, v1
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-WGP-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
+; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-CU-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-CU-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v0, v1
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v0, v1
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(3) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(3) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(3) %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll
new file mode 100644
index 0000000000000..2af195461d2eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll
@@ -0,0 +1,23347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @private_cluster_unordered_load(
+; GFX6-LABEL: private_cluster_unordered_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") unordered, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_load(
+; GFX6-LABEL: private_cluster_monotonic_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") monotonic, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_load(
+; GFX6-LABEL: private_cluster_acquire_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") acquire, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_load(
+; GFX6-LABEL: private_cluster_seq_cst_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") seq_cst, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_unordered_store(
+; GFX6-LABEL: private_cluster_unordered_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_store(
+; GFX6-LABEL: private_cluster_monotonic_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_store(
+; GFX6-LABEL: private_cluster_release_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_store(
+; GFX6-LABEL: private_cluster_seq_cst_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_atomicrmw(
+; GFX6-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_atomicrmw(
+; GFX6-LABEL: private_cluster_acquire_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_atomicrmw(
+; GFX6-LABEL: private_cluster_release_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") release
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw(
+; GFX6-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw(
+; GFX6-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acquire
+  store i32 %val, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acq_rel
+  store i32 %val, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") seq_cst
+  store i32 %val, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_unordered_load(
+; GFX6-LABEL: private_cluster_one_as_unordered_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_unordered_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_unordered_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_unordered_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_unordered_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_unordered_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_unordered_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_unordered_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_unordered_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_unordered_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_unordered_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_unordered_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") unordered, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_load(
+; GFX6-LABEL: private_cluster_one_as_monotonic_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") monotonic, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_load(
+; GFX6-LABEL: private_cluster_one_as_acquire_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") acquire, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_load(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_load:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_load:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+  %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") seq_cst, align 4
+  store i32 %val, ptr addrspace(5) %out
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_unordered_store(
+; GFX6-LABEL: private_cluster_one_as_unordered_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_unordered_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_unordered_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_unordered_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_unordered_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_unordered_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_unordered_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_unordered_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_unordered_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_unordered_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_unordered_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_unordered_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_store(
+; GFX6-LABEL: private_cluster_one_as_monotonic_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_store(
+; GFX6-LABEL: private_cluster_one_as_release_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_store(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_store:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_store:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    i32 %in, ptr addrspace(5) %out) {
+entry:
+  store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") release
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acquire
+  store i32 %val, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acq_rel
+  store i32 %val, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") seq_cst
+  store i32 %val, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT:    s_mov_b32 s7, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s4, s4, s7
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT:    s_mov_b32 s7, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s4, s4, s7
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s3, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s3, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    s_add_u32 s0, s0, s15
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT:    s_mov_b32 s5, 16
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_add_i32 s5, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s5
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s5
+; GFX6-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_endpgm
+;
+; GFX7-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_u32 s0, s0, s17
+; GFX7-NEXT:    s_addc_u32 s1, s1, 0
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT:    s_mov_b32 s5, 16
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_add_i32 s5, s4, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s5
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT:    v_mov_b32_e32 v1, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v2, s5
+; GFX7-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT:    s_endpgm
+;
+; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT:    v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT:    s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV:       ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT:    s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT:    s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT:    s_nop 0
+; GFX942-NOTTGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT:    s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT:       ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT:    s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT:    s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT:    scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT:    s_nop 0
+; GFX942-TGSPLIT-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT:    s_endpgm
+;
+; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP:       ; %bb.0: ; %entry
+; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT:    s_mov_b32 s1, 16
+; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT:    s_endpgm
+;
+; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU:       ; %bb.0: ; %entry
+; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT:    s_mov_b32 s1, 16
+; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT:    s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT:    scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT:    v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT:    scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT:    s_endpgm
+;
+; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP:       ; %bb.0: ; %entry
+; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT:    s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT:    s_endpgm
+;
+; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT:    s_wait_alu 0xf1ff
+; GFX12-CU-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT:    scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 off, v0, s0
+; GFX1250-NEXT:    s_endpgm
+    ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+  %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst
+  %val0 = extractvalue { i32, i1 } %val, 0
+  store i32 %val0, ptr addrspace(5) %out, align 4
+  ret void
+}



More information about the llvm-commits mailing list