[llvm] [AMDGPU] Adjust workgroup fence lowering for single-wave workgroups (PR #187673)

Barbara Mitic via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 30 07:50:11 PDT 2026


https://github.com/barbara-amd updated https://github.com/llvm/llvm-project/pull/187673

>From 481bd1a3cf986e93f5ebe01b4233687077ecce28 Mon Sep 17 00:00:00 2001
From: bmitic_amdeng <Barbara.Mitic at amd.com>
Date: Fri, 20 Mar 2026 11:32:37 +0100
Subject: [PATCH 1/3] [AMDGPU] Relax workgroup fences for single-wave
 workgroups

When the whole workgroup fits in a single wavefront (i.e. at most one
wave), a fence syncscope("workgroup") can be lowered to
syncscope("wavefront"): there are no other waves in the workgroup for
that fence to synchronize across, so wavefront scope is sufficient.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 20 +++++++++
 .../codegen-prepare-relax-workgroup-fence.ll  | 42 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8a80101d79a8e..380cf7cb56e0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -256,6 +256,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
+  bool visitFenceInst(FenceInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
@@ -1998,6 +1999,25 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   return true;
 }
 
+// When all waves of the workgroup fit in one wave, workgroup fences can be
+// lowered to wavefront scope.
+bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
+  unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+  if (WGMaxSize > ST.getWavefrontSize())
+    return false;
+
+  SyncScope::ID WorkgroupSSID =
+      F.getContext().getOrInsertSyncScopeID("workgroup");
+  SyncScope::ID WavefrontSSID =
+      F.getContext().getOrInsertSyncScopeID("wavefront");
+
+  if (I.getSyncScopeID() != WorkgroupSSID)
+    return false;
+
+  I.setSyncScopeID(WavefrontSSID);
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   Intrinsic::ID IID = I.getIntrinsicID();
   switch (IID) {
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
new file mode 100644
index 0000000000000..b8aecf451e31c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    fence syncscope("wavefront") acq_rel
+; CHECK-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
+; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE32-NEXT:    fence syncscope("workgroup") acq_rel
+; WAVE32-NEXT:    ret void
+;
+; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE64-NEXT:    fence syncscope("wavefront") acq_rel
+; WAVE64-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
+; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    fence syncscope("workgroup") acq_rel
+; CHECK-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }

>From e74c69993cb6efed9f708582b3b13f82eca1ceb4 Mon Sep 17 00:00:00 2001
From: barbara-amd <Barbara.Mitic at amd.com>
Date: Tue, 24 Mar 2026 13:47:04 +0100
Subject: [PATCH 2/3] [AMDGPU] Refine single-wave workgroup fence lowering

Keep workgroup scope on the fence; when the workgroup is known to fit in a
single wave, use wavefront scope only for the wait sequence emitted by the
legalizer.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 20 --------
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  | 22 ++++++---
 .../codegen-prepare-relax-workgroup-fence.ll  | 42 ----------------
 ...y-legalizer-single-wave-workgroup-fence.ll | 49 +++++++++++++++++++
 4 files changed, 65 insertions(+), 68 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 380cf7cb56e0c..8a80101d79a8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -256,7 +256,6 @@ class AMDGPUCodeGenPrepareImpl
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
-  bool visitFenceInst(FenceInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
@@ -1999,25 +1998,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   return true;
 }
 
-// When all waves of the workgroup fit in one wave, workgroup fences can be
-// lowered to wavefront scope.
-bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
-  unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
-  if (WGMaxSize > ST.getWavefrontSize())
-    return false;
-
-  SyncScope::ID WorkgroupSSID =
-      F.getContext().getOrInsertSyncScopeID("workgroup");
-  SyncScope::ID WavefrontSSID =
-      F.getContext().getOrInsertSyncScopeID("wavefront");
-
-  if (I.getSyncScopeID() != WorkgroupSSID)
-    return false;
-
-  I.setSyncScopeID(WavefrontSSID);
-  return true;
-}
-
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   Intrinsic::ID IID = I.getIntrinsicID();
   switch (IID) {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e5f352a3ed110..6b694291854fe 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -592,7 +592,7 @@ class SIMemoryLegalizer final {
                    MachineBasicBlock::iterator &MI);
   /// Expands atomic fence operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
-  bool expandAtomicFence(const SIMemOpInfo &MOI,
+  bool expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
                          MachineBasicBlock::iterator &MI);
   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
@@ -2326,7 +2326,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   return Changed;
 }
 
-bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
+bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
                                           MachineBasicBlock::iterator &MI) {
   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 
@@ -2335,11 +2335,21 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
 
   const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
 
+  // When all waves of the workgroup fit in one wave, workgroup fences can be
+  // lowered to wavefront scope.
+  SIAtomicScope ScopeForFence = MOI.getScope();
+  if (ScopeForFence == SIAtomicScope::WORKGROUP) {
+    const Function &F = MI->getMF()->getFunction();
+    const unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+    if (WGMaxSize <= ST.getWavefrontSize())
+      ScopeForFence = SIAtomicScope::WAVEFRONT;
+  }
+
   if (MOI.isAtomic()) {
     const AtomicOrdering Order = MOI.getOrdering();
     if (Order == AtomicOrdering::Acquire) {
       // Acquire fences only need to wait on the previous atomic they pair with.
-      Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
+      Changed |= CC->insertWait(MI, ScopeForFence, OrderingAddrSpace,
                                 SIMemOp::LOAD | SIMemOp::STORE,
                                 MOI.getIsCrossAddressSpaceOrdering(),
                                 Position::BEFORE, Order, /*AtomicsOnly=*/true);
@@ -2355,7 +2365,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
       /// generate a fence. Could add support in this file for
       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
       /// adding S_WAITCNT before a S_BARRIER.
-      Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
+      Changed |= CC->insertRelease(MI, ScopeForFence, OrderingAddrSpace,
                                    MOI.getIsCrossAddressSpaceOrdering(),
                                    Position::BEFORE);
 
@@ -2367,7 +2377,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
     if (Order == AtomicOrdering::Acquire ||
         Order == AtomicOrdering::AcquireRelease ||
         Order == AtomicOrdering::SequentiallyConsistent)
-      Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
+        Changed |= CC->insertAcquire(MI, ScopeForFence, OrderingAddrSpace,
                                    Position::BEFORE);
 
     return Changed;
@@ -2492,7 +2502,7 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
           Changed |= expandLDSDMA(*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
-          Changed |= expandAtomicFence(*MOI, MI);
+          Changed |= expandAtomicFence(ST,*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
           Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
       }
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
deleted file mode 100644
index b8aecf451e31c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
-
-define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
-; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    fence syncscope("wavefront") acq_rel
-; CHECK-NEXT:    ret void
-;
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
-; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
-; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-; WAVE32-NEXT:    fence syncscope("workgroup") acq_rel
-; WAVE32-NEXT:    ret void
-;
-; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
-; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-; WAVE64-NEXT:    fence syncscope("wavefront") acq_rel
-; WAVE64-NEXT:    ret void
-;
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
-; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    fence syncscope("workgroup") acq_rel
-; CHECK-NEXT:    ret void
-;
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
-attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
-attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
new file mode 100644
index 0000000000000..151d326d5cb73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup32() #0 {
+  ; CHECK-LABEL: name: single_wave_workgroup32
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64() #1 {
+  ; WAVE32-LABEL: name: single_wave_workgroup64
+  ; WAVE32: bb.0 (%ir-block.0):
+  ; WAVE32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; WAVE32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; WAVE32-NEXT:   S_ENDPGM 0
+  ;
+  ; WAVE64-LABEL: name: single_wave_workgroup64
+  ; WAVE64: bb.0 (%ir-block.0):
+  ; WAVE64-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup() #2 {
+  ; CHECK-LABEL: name: multi_wave_workgroup
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_STORECNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_DSCNT_soft 0
+  ; CHECK-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; CHECK-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
+

>From a21ebafea093a01457e48d710440e7ecf756afa6 Mon Sep 17 00:00:00 2001
From: barbara-amd <Barbara.Mitic at amd.com>
Date: Mon, 30 Mar 2026 16:47:27 +0200
Subject: [PATCH 3/3] [AMDGPU] Demote workgroup atomics to wavefront for
 single-wave work-groups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the kernel’s maximum flat work-group size is at most the wavefront
size, workgroup-scoped synchronization matches wavefront scope.
SIMemoryLegalizer applies that demotion in SIMemOpInfo for fences and for
atomic loads, stores, atomicrmw, and cmpxchg whose ordering is non-relaxed.
---
 llvm/docs/AMDGPUUsage.rst                     |   13 +
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |   44 +-
 .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll |  184 +-
 .../CodeGen/AMDGPU/global-saddr-atomics.ll    |  672 ++--
 ...y-legalizer-single-wave-workgroup-fence.ll |   49 -
 ...-legalizer-single-wave-workgroup-memops.ll | 2759 +++++++++++++++++
 6 files changed, 3053 insertions(+), 668 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 1ede5ca2d4cf6..6d73a4b532c5e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -7141,6 +7141,19 @@ treated as non-atomic.
 A memory synchronization scope wider than work-group is not meaningful for the
 group (LDS) address space and is treated as work-group.
 
+When a kernel's maximum flat work-group size does not exceed the wavefront
+size, each work-group fits within a single wavefront. In this case, the LLVM
+``workgroup`` synchronization scope is equivalent to ``wavefront`` scope.
+
+If the compiler can determine this bound (e.g., via ``amdgpu-flat-work-group-size``),
+the AMDGPU backend optimizes ``workgroup`` scope operations by lowering them to
+``wavefront``-scoped machine instructions.
+
+This demotion applies to atomic ``load``, ``store``, ``atomicrmw``, and
+``cmpxchg`` instructions, and to ``fence`` instructions, when they use a
+synchronizing memory ordering (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``).
+
+
 The memory model does not support the region address space which is treated as
 non-atomic.
 
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6b694291854fe..53c10d6700c5a 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -117,7 +117,8 @@ class SIMemOpInfo final {
       bool IsCrossAddressSpaceOrdering = true,
       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
       bool IsVolatile = false, bool IsNonTemporal = false,
-      bool IsLastUse = false, bool IsCooperative = false)
+      bool IsLastUse = false, bool IsCooperative = false,
+      const Function *ScopeDemotionFn = nullptr)
       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
@@ -165,6 +166,19 @@ class SIMemOpInfo final {
     // AGENT scope as a conservatively correct alternative.
     if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
       this->Scope = SIAtomicScope::AGENT;
+
+    // When max flat work-group size is at most the wavefront size, the
+    // work-group fits in a single wave, so LLVM workgroup scope matches
+    // wavefront scope. Demote workgroup to wavefront here for fences and for
+    // atomics with ordering stronger than monotonic.
+    if (ScopeDemotionFn && this->Scope == SIAtomicScope::WORKGROUP &&
+        (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
+         llvm::isStrongerThan(this->FailureOrdering,
+                              AtomicOrdering::Monotonic)) &&
+        ST.getFlatWorkGroupSizes(*ScopeDemotionFn).second <=
+            ST.getWavefrontSize()) {
+      this->Scope = SIAtomicScope::WAVEFRONT;
+    }
   }
 
 public:
@@ -592,7 +606,7 @@ class SIMemoryLegalizer final {
                    MachineBasicBlock::iterator &MI);
   /// Expands atomic fence operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
-  bool expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
+  bool expandAtomicFence(const SIMemOpInfo &MOI,
                          MachineBasicBlock::iterator &MI);
   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
@@ -809,7 +823,8 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   }
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal, IsLastUse, IsCooperative);
+                     IsNonTemporal, IsLastUse, IsCooperative,
+                     &MI->getMF()->getFunction());
 }
 
 std::optional<SIMemOpInfo>
@@ -878,7 +893,8 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
 
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
                      SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
-                     AtomicOrdering::NotAtomic);
+                     AtomicOrdering::NotAtomic, false, false, false, false,
+                     &MI->getMF()->getFunction());
 }
 
 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2326,7 +2342,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   return Changed;
 }
 
-bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
+bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                           MachineBasicBlock::iterator &MI) {
   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 
@@ -2335,21 +2351,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
 
   const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
 
-  // When all waves of the workgroup fit in one wave, workgroup fences can be
-  // lowered to wavefront scope.
-  SIAtomicScope ScopeForFence = MOI.getScope();
-  if (ScopeForFence == SIAtomicScope::WORKGROUP) {
-    const Function &F = MI->getMF()->getFunction();
-    const unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
-    if (WGMaxSize <= ST.getWavefrontSize())
-      ScopeForFence = SIAtomicScope::WAVEFRONT;
-  }
-
   if (MOI.isAtomic()) {
     const AtomicOrdering Order = MOI.getOrdering();
     if (Order == AtomicOrdering::Acquire) {
       // Acquire fences only need to wait on the previous atomic they pair with.
-      Changed |= CC->insertWait(MI, ScopeForFence, OrderingAddrSpace,
+      Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
                                 SIMemOp::LOAD | SIMemOp::STORE,
                                 MOI.getIsCrossAddressSpaceOrdering(),
                                 Position::BEFORE, Order, /*AtomicsOnly=*/true);
@@ -2365,7 +2371,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
       /// generate a fence. Could add support in this file for
       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
       /// adding S_WAITCNT before a S_BARRIER.
-      Changed |= CC->insertRelease(MI, ScopeForFence, OrderingAddrSpace,
+      Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
                                    MOI.getIsCrossAddressSpaceOrdering(),
                                    Position::BEFORE);
 
@@ -2377,7 +2383,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
     if (Order == AtomicOrdering::Acquire ||
         Order == AtomicOrdering::AcquireRelease ||
         Order == AtomicOrdering::SequentiallyConsistent)
-        Changed |= CC->insertAcquire(MI, ScopeForFence, OrderingAddrSpace,
+        Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
                                    Position::BEFORE);
 
     return Changed;
@@ -2502,7 +2508,7 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
           Changed |= expandLDSDMA(*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
-          Changed |= expandAtomicFence(ST,*MOI, MI);
+          Changed |= expandAtomicFence(*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
           Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
       }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 275825a973415..9d0d43d900026 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -5916,7 +5916,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn:
@@ -5925,7 +5924,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_smax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn:
@@ -5935,7 +5933,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -5948,7 +5945,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5960,7 +5956,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_smax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5973,7 +5968,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6002,16 +5996,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX1250-SDAG-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB58_5
 ; GFX1250-SDAG-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-SDAG-NEXT:  .LBB58_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6045,16 +6041,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX1250-GISEL-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB58_5
 ; GFX1250-GISEL-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-GISEL-NEXT:  .LBB58_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6084,11 +6082,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX950-SDAG-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB58_5
 ; GFX950-SDAG-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6097,6 +6094,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
@@ -6128,11 +6126,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX950-GISEL-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB58_5
 ; GFX950-GISEL-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6141,6 +6138,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
@@ -6182,16 +6180,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX1250-SDAG-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB59_5
 ; GFX1250-SDAG-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-SDAG-NEXT:  .LBB59_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6228,16 +6228,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX1250-GISEL-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB59_5
 ; GFX1250-GISEL-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-GISEL-NEXT:  .LBB59_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6270,11 +6272,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX950-SDAG-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB59_5
 ; GFX950-SDAG-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6283,6 +6284,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
@@ -6317,11 +6319,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX950-GISEL-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB59_5
 ; GFX950-GISEL-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6330,6 +6331,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
@@ -6372,7 +6374,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB60_2
@@ -6410,7 +6411,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB60_2
@@ -6443,7 +6443,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB60_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6480,7 +6479,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB60_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6528,7 +6526,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB61_2
@@ -6569,7 +6566,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB61_2
@@ -6605,7 +6601,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB61_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6646,7 +6641,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB61_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6752,7 +6746,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn:
@@ -6761,7 +6754,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_smin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn:
@@ -6771,7 +6763,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6784,7 +6775,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6796,7 +6786,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_smin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6809,7 +6798,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6838,16 +6826,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX1250-SDAG-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB66_5
 ; GFX1250-SDAG-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB66_2
 ; GFX1250-SDAG-NEXT:  .LBB66_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6881,16 +6871,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX1250-GISEL-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB66_5
 ; GFX1250-GISEL-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB66_2
 ; GFX1250-GISEL-NEXT:  .LBB66_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6920,11 +6912,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX950-SDAG-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB66_5
 ; GFX950-SDAG-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6933,6 +6924,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
@@ -6964,11 +6956,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX950-GISEL-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB66_5
 ; GFX950-GISEL-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6977,6 +6968,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
@@ -7018,16 +7010,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX1250-SDAG-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB67_5
 ; GFX1250-SDAG-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB67_2
 ; GFX1250-SDAG-NEXT:  .LBB67_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7064,16 +7058,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX1250-GISEL-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB67_5
 ; GFX1250-GISEL-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB67_2
 ; GFX1250-GISEL-NEXT:  .LBB67_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7106,11 +7102,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX950-SDAG-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB67_5
 ; GFX950-SDAG-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7119,6 +7114,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
@@ -7153,11 +7149,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX950-GISEL-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB67_5
 ; GFX950-GISEL-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7166,6 +7161,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
@@ -7208,7 +7204,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB68_2
@@ -7246,7 +7241,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB68_2
@@ -7279,7 +7273,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB68_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7316,7 +7309,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB68_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7364,7 +7356,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB69_2
@@ -7405,7 +7396,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB69_2
@@ -7441,7 +7431,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB69_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7482,7 +7471,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB69_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7588,7 +7576,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn:
@@ -7597,7 +7584,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_umax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn:
@@ -7607,7 +7593,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -7620,7 +7605,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7632,7 +7616,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_umax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7645,7 +7628,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -7674,16 +7656,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX1250-SDAG-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB74_5
 ; GFX1250-SDAG-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB74_2
 ; GFX1250-SDAG-NEXT:  .LBB74_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7717,16 +7701,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX1250-GISEL-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB74_5
 ; GFX1250-GISEL-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB74_2
 ; GFX1250-GISEL-NEXT:  .LBB74_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7756,11 +7742,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX950-SDAG-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB74_5
 ; GFX950-SDAG-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7769,6 +7754,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
@@ -7800,11 +7786,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX950-GISEL-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB74_5
 ; GFX950-GISEL-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7813,6 +7798,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
@@ -7854,16 +7840,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX1250-SDAG-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB75_5
 ; GFX1250-SDAG-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB75_2
 ; GFX1250-SDAG-NEXT:  .LBB75_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7900,16 +7888,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX1250-GISEL-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB75_5
 ; GFX1250-GISEL-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB75_2
 ; GFX1250-GISEL-NEXT:  .LBB75_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7942,11 +7932,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX950-SDAG-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB75_5
 ; GFX950-SDAG-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7955,6 +7944,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
@@ -7989,11 +7979,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX950-GISEL-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB75_5
 ; GFX950-GISEL-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8002,6 +7991,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8044,7 +8034,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB76_2
@@ -8082,7 +8071,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB76_2
@@ -8115,7 +8103,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB76_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8152,7 +8139,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB76_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8200,7 +8186,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB77_2
@@ -8241,7 +8226,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB77_2
@@ -8277,7 +8261,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB77_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8318,7 +8301,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB77_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8424,7 +8406,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn:
@@ -8433,7 +8414,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_umin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn:
@@ -8443,7 +8423,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -8456,7 +8435,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8468,7 +8446,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_umin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8481,7 +8458,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -8510,16 +8486,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX1250-SDAG-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB82_5
 ; GFX1250-SDAG-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB82_2
 ; GFX1250-SDAG-NEXT:  .LBB82_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8553,16 +8531,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX1250-GISEL-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB82_5
 ; GFX1250-GISEL-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB82_2
 ; GFX1250-GISEL-NEXT:  .LBB82_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8592,11 +8572,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX950-SDAG-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB82_5
 ; GFX950-SDAG-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8605,6 +8584,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
@@ -8636,11 +8616,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX950-GISEL-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB82_5
 ; GFX950-GISEL-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8649,6 +8628,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8690,16 +8670,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX1250-SDAG-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB83_5
 ; GFX1250-SDAG-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB83_2
 ; GFX1250-SDAG-NEXT:  .LBB83_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8736,16 +8718,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX1250-GISEL-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB83_5
 ; GFX1250-GISEL-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB83_2
 ; GFX1250-GISEL-NEXT:  .LBB83_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8778,11 +8762,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX950-SDAG-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB83_5
 ; GFX950-SDAG-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8791,6 +8774,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
@@ -8825,11 +8809,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX950-GISEL-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB83_5
 ; GFX950-GISEL-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8838,6 +8821,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8880,7 +8864,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB84_2
@@ -8918,7 +8901,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB84_2
@@ -8951,7 +8933,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB84_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8988,7 +8969,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB84_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -9036,7 +9016,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB85_2
@@ -9077,7 +9056,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB85_2
@@ -9113,7 +9091,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB85_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -9154,7 +9131,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB85_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 58f7c4340276d..466d7152a0d84 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -2142,31 +2142,22 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2176,31 +2167,22 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2211,30 +2193,19 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2243,30 +2214,19 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2276,31 +2236,22 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2310,31 +2261,22 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
 }
 
 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2345,30 +2287,19 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
 }
 
 define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2377,30 +2308,19 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2414,31 +2334,22 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2448,31 +2359,22 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2483,30 +2385,19 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2515,30 +2406,19 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2548,31 +2428,22 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2582,31 +2453,22 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
 }
 
 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2617,30 +2479,19 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
 }
 
 define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2649,30 +2500,19 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2686,31 +2526,22 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2720,31 +2551,22 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2755,30 +2577,19 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2787,30 +2598,19 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2820,31 +2620,22 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2854,31 +2645,22 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2889,30 +2671,19 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
 }
 
 define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2921,30 +2692,19 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2958,31 +2718,22 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2992,31 +2743,22 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3027,30 +2769,19 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3059,30 +2790,19 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3092,31 +2812,22 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3126,31 +2837,22 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3161,30 +2863,19 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
 }
 
 define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3193,30 +2884,19 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
deleted file mode 100644
index 151d326d5cb73..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE64
-
-define amdgpu_kernel void @single_wave_workgroup32() #0 {
-  ; CHECK-LABEL: name: single_wave_workgroup32
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   S_ENDPGM 0
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @single_wave_workgroup64() #1 {
-  ; WAVE32-LABEL: name: single_wave_workgroup64
-  ; WAVE32: bb.0 (%ir-block.0):
-  ; WAVE32-NEXT:   S_WAIT_BVHCNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_SAMPLECNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_LOADCNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_STORECNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_DSCNT_soft 0
-  ; WAVE32-NEXT:   GLOBAL_INV 8, implicit $exec
-  ; WAVE32-NEXT:   S_ENDPGM 0
-  ;
-  ; WAVE64-LABEL: name: single_wave_workgroup64
-  ; WAVE64: bb.0 (%ir-block.0):
-  ; WAVE64-NEXT:   S_ENDPGM 0
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @multi_wave_workgroup() #2 {
-  ; CHECK-LABEL: name: multi_wave_workgroup
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   S_WAIT_BVHCNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_SAMPLECNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_LOADCNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_STORECNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_DSCNT_soft 0
-  ; CHECK-NEXT:   GLOBAL_INV 8, implicit $exec
-  ; CHECK-NEXT:   S_ENDPGM 0
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
-attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
-attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
-
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
new file mode 100644
index 0000000000000..aaa295992c361
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
@@ -0,0 +1,2759 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX1250
+
+
+define amdgpu_kernel void @wg_fence_acq_rel_single32() #0 {
+  ; GFX9-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_acq_rel_multi() #2 {
+  ; GFX9-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_acquire_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acquire
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_release_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_release_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_release_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_release_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_release_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_release_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_release_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_release_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") release
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_seq_cst_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_single32(ptr addrspace(1) %p) #0 {
+  ; GFX9-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_single64(ptr addrspace(1) %p) #1 {
+  ; GFX9-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+; Workgroup may span multiple waves (attr #2; attribute body not in this chunk —
+; presumably a flat-work-group-size larger than one wave; confirm against the
+; attributes section). The workgroup-scope seq_cst load must therefore keep its
+; wait sequence and, where the target has one, the L0 invalidate
+; (BUFFER_GL0_INV on GFX10, GLOBAL_INV on GFX12).
+define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
+  ; GFX9-LABEL: name: wg_ld_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_ld_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_ld_seq_cst_multi
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+; Acquire load at workgroup scope under attr #1 (presumably a max workgroup size
+; of 64 lanes; attribute body not visible here — confirm). Wave64 configurations
+; (GFX9, GFX942, GFX10-W64, GFX12-W64) drop the acquire invalidate entirely,
+; while wave32 configurations (GFX10-W32, GFX12-W32) still emit it, since a
+; single 32-lane wave cannot cover a 64-lane workgroup.
+define amdgpu_kernel void @wg_ld_acquire_single64(ptr addrspace(1) %p) #1 {
+  ; GFX9-LABEL: name: wg_ld_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_ld_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_ld_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_ld_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_ld_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") acquire, align 4
+  ret void
+}
+
+; Monotonic load requires no ordering code regardless of workgroup size: no
+; waits or invalidates appear on any target; only the per-target cache-policy
+; operand on the load (0/1/8) differs between configurations.
+define amdgpu_kernel void @wg_ld_monotonic_single64(ptr addrspace(1) %p) #1 {
+  ; GFX9-LABEL: name: wg_ld_monotonic_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_monotonic_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_ld_monotonic_single64
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_ld_monotonic_single64
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_monotonic_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4
+  ret void
+}
+
+; seq_cst store at workgroup scope under attr #0 (presumably a workgroup capped
+; at 32 lanes; attribute body not in this chunk — confirm). No release wait
+; sequence is expected before the store on any target, including wave32 ones.
+; NOTE(review): the S_WAIT_XCNT_soft on GFX1250 looks like an unrelated hazard
+; wait rather than part of the fence — confirm.
+define amdgpu_kernel void @wg_st_seq_cst_single32(ptr addrspace(1) %p, i32 %x) #0 {
+  ; GFX9-LABEL: name: wg_st_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_st_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_st_seq_cst_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_st_seq_cst_single32
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_st_seq_cst_single32
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_st_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+; Same seq_cst store, now under attr #1 (presumably max 64 lanes; confirm
+; against the attributes section). Wave64 configurations (GFX9, GFX942,
+; GFX10-W64, GFX12-W64) emit no release waits before the store, while wave32
+; configurations (GFX10-W32, GFX12-W32, GFX1250) keep the full wait sequence.
+define amdgpu_kernel void @wg_st_seq_cst_single64(ptr addrspace(1) %p, i32 %x) #1 {
+  ; GFX9-LABEL: name: wg_st_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_st_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_st_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_st_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_st_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_st_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_st_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+; Multi-wave workgroup (attr #2; attribute body not in this chunk): the release
+; wait sequence before the workgroup-scope seq_cst store is retained on every
+; target configuration.
+define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
+  ; GFX9-LABEL: name: wg_st_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_st_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 1, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_st_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_st_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_st_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_st_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @wg_st_release_single64(ptr addrspace(1) %p, i32 %x) #1 {
+  ; NOTE(review): auto-generated checks (update_mir_test_checks.py) -- do not
+  ; hand-edit the check lines. Attribute #1 presumably caps the flat workgroup
+  ; size at 64 lanes; confirm against the attribute definition earlier in file.
+  ; GFX9-LABEL: name: wg_st_release_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_st_release_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_st_release_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_st_release_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_st_release_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_st_release_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_st_release_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; Tested IR below: a workgroup-scope release store. Per the checks above,
+  ; wave64 prefixes (the first two, plus the -W64 variants) emit no soft-wait
+  ; instructions before the store, while the wave32 (-W32) variants and the
+  ; last prefix keep the release waits. Presumably wave64 covers the 64-lane
+  ; workgroup cap from #1, enabling the relaxation -- confirm against the PR.
+  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @wg_rmw_add_seq_cst_single32(ptr addrspace(1) %p) #0 {
+  ; NOTE(review): auto-generated checks (update_mir_test_checks.py) -- do not
+  ; hand-edit the check lines. Attribute #0 presumably caps the flat workgroup
+  ; size at 32 lanes; confirm against the attribute definition earlier in file.
+  ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; Tested IR below: a workgroup-scope seq_cst atomicrmw add of a uniform
+  ; value. The checks show it lowered so that only lane 0 survives the EXEC
+  ; ballot (V_MBCNT + compare + save-exec) and issues a single
+  ; GLOBAL_ATOMIC_ADD of active-lane-count * 7 (S_BCNT1 + S_MUL_I32); %r is
+  ; unused, so no result broadcast appears. None of the prefixes emit the
+  ; seq_cst cache-sync waits seen in the multi-wave tests; the one remaining
+  ; S_WAIT_XCNT_soft on the last prefix is presumably an unrelated hazard
+  ; wait -- confirm against the PR discussion.
+  %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_rmw_add_seq_cst_single64(ptr addrspace(1) %p) #1 {
+  ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+  ret void
+}
+
+; NOTE(review): attribute group #2 (declared outside this hunk) presumably permits a
+; multi-wave workgroup — TODO confirm against the #2 definition. Consistent with that,
+; the checks below keep workgroup-scope synchronization around the seq_cst RMW
+; (e.g. S_WAITCNT/S_WAIT_*_soft before the atomic, BUFFER_GL0_INV / GLOBAL_INV after
+; on GFX10-W32/GFX12), i.e. the fence is NOT relaxed to wavefront scope here.
+define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
+  ; GFX9-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W64-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W64-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W64-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+  ret void
+}
+
+; NOTE(review): attribute group #1 (declared outside this hunk) presumably caps the
+; workgroup at a single wave64 — TODO confirm against the #1 definition. The checks
+; show the effect of the relaxation: the wave64 runs (GFX10-W64, GFX12-W64) emit the
+; atomic with no surrounding waitcnts or cache invalidates, while the wave32 runs
+; (GFX10-W32, GFX12-W32) — where one wave64-sized workgroup does not fit in a single
+; wave — keep the workgroup-scope waits and BUFFER_GL0_INV / GLOBAL_INV.
+define amdgpu_kernel void @wg_rmw_xchg_acq_rel_single64(ptr addrspace(1) %p, i32 %x) #1 {
+  ; GFX9-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw xchg ptr addrspace(1) %p, i32 %x syncscope("workgroup") acq_rel
+  ret void
+}
+
+; NOTE(review): cmpxchg variant of the single-wave case (attribute group #1, declared
+; outside this hunk — presumably a one-wave64 workgroup limit, TODO confirm). As with
+; the xchg test above, wave64 runs (GFX10-W64, GFX12-W64) carry no waitcnts or cache
+; invalidates around the atomic, while wave32 runs (GFX10-W32, GFX12-W32) retain the
+; workgroup-scope waits and BUFFER_GL0_INV / GLOBAL_INV.
+define amdgpu_kernel void @wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p, i32 %cmp, i32 %new) #2 {
+  ; GFX9-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_acquire_acquire_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_single32(ptr addrspace(3) %p) #0 {
+  ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_single64(ptr addrspace(3) %p) #1 {
+  ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
+  ; GFX9-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_st_release_single64(ptr addrspace(3) %p, i32 %x) #1 {
+  ; GFX9-LABEL: name: lds_wg_st_release_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_st_release_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_st_release_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_st_release_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_st_release_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_st_release_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_st_release_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(3) %p syncscope("workgroup") release, align 4
+  ret void
+}
+
+; Workgroup-scope atomicrmw add acq_rel on LDS under attribute #1 (the
+; "single64" suffix suggests #1 caps the flat workgroup size at one wave64 --
+; confirm against the #1 definition, which is outside this hunk). The
+; autogenerated checks below record per-target lowering: W64 targets emit
+; DS_ADD_U32 with no surrounding cache-control waits, while GFX10-W32,
+; GFX12-W32 and GFX1250 keep soft waitcnts (and GFX10/12-W32 an invalidate)
+; around the atomic. The scalarization (MBCNT + EXEC save/branch) is the
+; uniform-atomic optimization; result %r is intentionally unused.
+define amdgpu_kernel void @lds_wg_rmw_add_acq_rel_single64(ptr addrspace(3) %p) #1 {
+  ; GFX9-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX12-W32-NEXT:   $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX1250-NEXT:   $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(3) %p, i32 3 syncscope("workgroup") acq_rel
+  ret void
+}
+
+; Workgroup-scope cmpxchg acq_rel/monotonic on LDS under attribute #1
+; (presumably a single-wave workgroup cap -- confirm against the #1
+; definition, outside this hunk). Per the checks: W64 and pre-GFX10 targets
+; lower to a bare DS_CMPST/DS_CMPSTORE with no waits or invalidates, while
+; GFX10-W32/GFX12-W32 keep soft waitcnts plus a GL0/GLOBAL invalidate and
+; GFX1250 keeps soft waitcnts only.
+define amdgpu_kernel void @lds_wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
+  ret void
+}
+
+; Workgroup-scope unordered atomic load from LDS under attribute #1. The
+; loaded value %v is unused, so per the checks every target's output is just
+; S_ENDPGM (the dead load is eliminated); GFX1250 additionally emits its
+; mode-setup S_SETREG. Serves as a baseline: unordered needs no fencing code.
+define amdgpu_kernel void @lds_wg_ld_unordered_single64(ptr addrspace(3) %p) #1 {
+  ; GFX9-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") unordered, align 4
+  ret void
+}
+
+; Workgroup-scope cmpxchg monotonic/acquire (acquire only on the failure
+; path) on LDS under attribute #1. Per the checks: no pre-atomic waits are
+; emitted on any target (monotonic success ordering); GFX10-W32/GFX12-W32
+; keep a post-atomic wait + invalidate and GFX1250 a post-atomic DSCNT wait,
+; while W64 and pre-GFX10 targets emit the bare DS_CMPST/DS_CMPSTORE.
+define amdgpu_kernel void @lds_wg_cmpxchg_monotonic_acquire_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_wg_ld_acquire_single64(ptr addrspace(0) %p) #1 {
+  ; GFX9-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 1, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 8, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(0) %p syncscope("workgroup") acquire, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x) #2 {
+  ; GFX9-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 1, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX10-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   FLAT_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(0) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }



More information about the llvm-commits mailing list