[llvm] [AMDGPU] Use wavefront scope for single-wave workgroup synchronization (PR #187673)

Barbara Mitic via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 08:05:06 PDT 2026


https://github.com/barbara-amd updated https://github.com/llvm/llvm-project/pull/187673

>From 481bd1a3cf986e93f5ebe01b4233687077ecce28 Mon Sep 17 00:00:00 2001
From: bmitic_amdeng <Barbara.Mitic at amd.com>
Date: Fri, 20 Mar 2026 11:32:37 +0100
Subject: [PATCH 1/4] [AMDGPU] Relax workgroup fences for single-wave
 workgroups

When the whole workgroup fits in a single wavefront (i.e. at most one
wave), a fence syncscope("workgroup") can be lowered to
syncscope("wavefront"): there are no other waves in the workgroup for
that fence to synchronize across, so wavefront scope is sufficient.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 20 +++++++++
 .../codegen-prepare-relax-workgroup-fence.ll  | 42 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8a80101d79a8e..380cf7cb56e0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -256,6 +256,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
+  bool visitFenceInst(FenceInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
@@ -1998,6 +1999,25 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   return true;
 }
 
+// When all waves of the workgroup fit in one wave, workgroup fences can be
+// lowered to wavefront scope.
+bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
+  unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+  if (WGMaxSize > ST.getWavefrontSize())
+    return false;
+
+  SyncScope::ID WorkgroupSSID =
+      F.getContext().getOrInsertSyncScopeID("workgroup");
+  SyncScope::ID WavefrontSSID =
+      F.getContext().getOrInsertSyncScopeID("wavefront");
+
+  if (I.getSyncScopeID() != WorkgroupSSID)
+    return false;
+
+  I.setSyncScopeID(WavefrontSSID);
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   Intrinsic::ID IID = I.getIntrinsicID();
   switch (IID) {
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
new file mode 100644
index 0000000000000..b8aecf451e31c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    fence syncscope("wavefront") acq_rel
+; CHECK-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
+; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE32-NEXT:    fence syncscope("workgroup") acq_rel
+; WAVE32-NEXT:    ret void
+;
+; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE64-NEXT:    fence syncscope("wavefront") acq_rel
+; WAVE64-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
+; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    fence syncscope("workgroup") acq_rel
+; CHECK-NEXT:    ret void
+;
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }

>From e74c69993cb6efed9f708582b3b13f82eca1ceb4 Mon Sep 17 00:00:00 2001
From: barbara-amd <Barbara.Mitic at amd.com>
Date: Tue, 24 Mar 2026 13:47:04 +0100
Subject: [PATCH 2/4] [AMDGPU] Refine single-wave workgroup fence lowering

Keep workgroup scope on the fence; when the workgroup is known to fit in a
single wave, use wavefront scope only for the wait sequence emitted by the
legalizer.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 20 --------
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  | 22 ++++++---
 .../codegen-prepare-relax-workgroup-fence.ll  | 42 ----------------
 ...y-legalizer-single-wave-workgroup-fence.ll | 49 +++++++++++++++++++
 4 files changed, 65 insertions(+), 68 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 380cf7cb56e0c..8a80101d79a8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -256,7 +256,6 @@ class AMDGPUCodeGenPrepareImpl
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
-  bool visitFenceInst(FenceInst &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
@@ -1999,25 +1998,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   return true;
 }
 
-// When all waves of the workgroup fit in one wave, workgroup fences can be
-// lowered to wavefront scope.
-bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
-  unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
-  if (WGMaxSize > ST.getWavefrontSize())
-    return false;
-
-  SyncScope::ID WorkgroupSSID =
-      F.getContext().getOrInsertSyncScopeID("workgroup");
-  SyncScope::ID WavefrontSSID =
-      F.getContext().getOrInsertSyncScopeID("wavefront");
-
-  if (I.getSyncScopeID() != WorkgroupSSID)
-    return false;
-
-  I.setSyncScopeID(WavefrontSSID);
-  return true;
-}
-
 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
   Intrinsic::ID IID = I.getIntrinsicID();
   switch (IID) {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e5f352a3ed110..6b694291854fe 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -592,7 +592,7 @@ class SIMemoryLegalizer final {
                    MachineBasicBlock::iterator &MI);
   /// Expands atomic fence operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
-  bool expandAtomicFence(const SIMemOpInfo &MOI,
+  bool expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
                          MachineBasicBlock::iterator &MI);
   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
@@ -2326,7 +2326,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   return Changed;
 }
 
-bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
+bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
                                           MachineBasicBlock::iterator &MI) {
   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 
@@ -2335,11 +2335,21 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
 
   const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
 
+  // When all waves of the workgroup fit in one wave, workgroup fences can be
+  // lowered to wavefront scope.
+  SIAtomicScope ScopeForFence = MOI.getScope();
+  if (ScopeForFence == SIAtomicScope::WORKGROUP) {
+    const Function &F = MI->getMF()->getFunction();
+    const unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+    if (WGMaxSize <= ST.getWavefrontSize())
+      ScopeForFence = SIAtomicScope::WAVEFRONT;
+  }
+
   if (MOI.isAtomic()) {
     const AtomicOrdering Order = MOI.getOrdering();
     if (Order == AtomicOrdering::Acquire) {
       // Acquire fences only need to wait on the previous atomic they pair with.
-      Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
+      Changed |= CC->insertWait(MI, ScopeForFence, OrderingAddrSpace,
                                 SIMemOp::LOAD | SIMemOp::STORE,
                                 MOI.getIsCrossAddressSpaceOrdering(),
                                 Position::BEFORE, Order, /*AtomicsOnly=*/true);
@@ -2355,7 +2365,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
       /// generate a fence. Could add support in this file for
       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
       /// adding S_WAITCNT before a S_BARRIER.
-      Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
+      Changed |= CC->insertRelease(MI, ScopeForFence, OrderingAddrSpace,
                                    MOI.getIsCrossAddressSpaceOrdering(),
                                    Position::BEFORE);
 
@@ -2367,7 +2377,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
     if (Order == AtomicOrdering::Acquire ||
         Order == AtomicOrdering::AcquireRelease ||
         Order == AtomicOrdering::SequentiallyConsistent)
-      Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
+        Changed |= CC->insertAcquire(MI, ScopeForFence, OrderingAddrSpace,
                                    Position::BEFORE);
 
     return Changed;
@@ -2492,7 +2502,7 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
           Changed |= expandLDSDMA(*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
-          Changed |= expandAtomicFence(*MOI, MI);
+          Changed |= expandAtomicFence(ST,*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
           Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
       }
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
deleted file mode 100644
index b8aecf451e31c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
-
-define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
-; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    fence syncscope("wavefront") acq_rel
-; CHECK-NEXT:    ret void
-;
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
-; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
-; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-; WAVE32-NEXT:    fence syncscope("workgroup") acq_rel
-; WAVE32-NEXT:    ret void
-;
-; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
-; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-; WAVE64-NEXT:    fence syncscope("wavefront") acq_rel
-; WAVE64-NEXT:    ret void
-;
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
-; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    fence syncscope("workgroup") acq_rel
-; CHECK-NEXT:    ret void
-;
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
-attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
-attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
new file mode 100644
index 0000000000000..151d326d5cb73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup32() #0 {
+  ; CHECK-LABEL: name: single_wave_workgroup32
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64() #1 {
+  ; WAVE32-LABEL: name: single_wave_workgroup64
+  ; WAVE32: bb.0 (%ir-block.0):
+  ; WAVE32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; WAVE32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; WAVE32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; WAVE32-NEXT:   S_ENDPGM 0
+  ;
+  ; WAVE64-LABEL: name: single_wave_workgroup64
+  ; WAVE64: bb.0 (%ir-block.0):
+  ; WAVE64-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup() #2 {
+  ; CHECK-LABEL: name: multi_wave_workgroup
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_STORECNT_soft 0
+  ; CHECK-NEXT:   S_WAIT_DSCNT_soft 0
+  ; CHECK-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; CHECK-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
+

>From a21ebafea093a01457e48d710440e7ecf756afa6 Mon Sep 17 00:00:00 2001
From: barbara-amd <Barbara.Mitic at amd.com>
Date: Mon, 30 Mar 2026 16:47:27 +0200
Subject: [PATCH 3/4] [AMDGPU] Demote workgroup atomics to wavefront for
 single-wave work-groups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the kernel’s maximum flat work-group size is at most the wavefront
size, workgroup-scoped synchronization matches wavefront scope.
SIMemoryLegalizer applies that demotion in SIMemOpInfo for fences and for
atomic loads, stores, atomicrmw, and cmpxchg whose ordering is non-relaxed.
---
 llvm/docs/AMDGPUUsage.rst                     |   13 +
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  |   44 +-
 .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll |  184 +-
 .../CodeGen/AMDGPU/global-saddr-atomics.ll    |  672 ++--
 ...y-legalizer-single-wave-workgroup-fence.ll |   49 -
 ...-legalizer-single-wave-workgroup-memops.ll | 2759 +++++++++++++++++
 6 files changed, 3053 insertions(+), 668 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 1ede5ca2d4cf6..6d73a4b532c5e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -7141,6 +7141,19 @@ treated as non-atomic.
 A memory synchronization scope wider than work-group is not meaningful for the
 group (LDS) address space and is treated as work-group.
 
+When a work-group's maximum flat work-group size does not exceed the wavefront
+size, the work-group fits within a single wavefront. In this case, LLVM
+``workgroup`` synchronization scope is equivalent to ``wavefront`` scope.
+
+If the compiler can determine this bound (e.g., via ``amdgpu-flat-work-group-size``),
+the AMDGPU backend optimizes ``workgroup`` scope operations by lowering them to
+``wavefront``-scoped machine instructions.
+
+This optimization applies to atomic ``load``, ``store``, ``atomicrmw``, and
+``cmpxchg`` instructions, and to ``fence`` instructions with synchronizing
+memory orderings (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``).
+
+
 The memory model does not support the region address space which is treated as
 non-atomic.
 
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6b694291854fe..53c10d6700c5a 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -117,7 +117,8 @@ class SIMemOpInfo final {
       bool IsCrossAddressSpaceOrdering = true,
       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
       bool IsVolatile = false, bool IsNonTemporal = false,
-      bool IsLastUse = false, bool IsCooperative = false)
+      bool IsLastUse = false, bool IsCooperative = false,
+      const Function *ScopeDemotionFn = nullptr)
       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
@@ -165,6 +166,19 @@ class SIMemOpInfo final {
     // AGENT scope as a conservatively correct alternative.
     if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
       this->Scope = SIAtomicScope::AGENT;
+
+    // When the max flat work-group size is at most the wavefront size, the
+    // work-group fits in a single wave, so LLVM workgroup scope matches
+    // wavefront scope. Demote workgroup to wavefront here for fences and for
+    // atomics with ordering stronger than monotonic.
+    if (ScopeDemotionFn && this->Scope == SIAtomicScope::WORKGROUP &&
+        (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
+         llvm::isStrongerThan(this->FailureOrdering,
+                              AtomicOrdering::Monotonic)) &&
+        ST.getFlatWorkGroupSizes(*ScopeDemotionFn).second <=
+            ST.getWavefrontSize()) {
+      this->Scope = SIAtomicScope::WAVEFRONT;
+    }
   }
 
 public:
@@ -592,7 +606,7 @@ class SIMemoryLegalizer final {
                    MachineBasicBlock::iterator &MI);
   /// Expands atomic fence operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
-  bool expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
+  bool expandAtomicFence(const SIMemOpInfo &MOI,
                          MachineBasicBlock::iterator &MI);
   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
   /// instructions are added/deleted or \p MI is modified, false otherwise.
@@ -809,7 +823,8 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   }
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal, IsLastUse, IsCooperative);
+                     IsNonTemporal, IsLastUse, IsCooperative,
+                     &MI->getMF()->getFunction());
 }
 
 std::optional<SIMemOpInfo>
@@ -878,7 +893,8 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
 
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
                      SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
-                     AtomicOrdering::NotAtomic);
+                     AtomicOrdering::NotAtomic, false, false, false, false,
+                     &MI->getMF()->getFunction());
 }
 
 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2326,7 +2342,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   return Changed;
 }
 
-bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
+bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                           MachineBasicBlock::iterator &MI) {
   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 
@@ -2335,21 +2351,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
 
   const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
 
-  // When all waves of the workgroup fit in one wave, workgroup fences can be
-  // lowered to wavefront scope.
-  SIAtomicScope ScopeForFence = MOI.getScope();
-  if (ScopeForFence == SIAtomicScope::WORKGROUP) {
-    const Function &F = MI->getMF()->getFunction();
-    const unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
-    if (WGMaxSize <= ST.getWavefrontSize())
-      ScopeForFence = SIAtomicScope::WAVEFRONT;
-  }
-
   if (MOI.isAtomic()) {
     const AtomicOrdering Order = MOI.getOrdering();
     if (Order == AtomicOrdering::Acquire) {
       // Acquire fences only need to wait on the previous atomic they pair with.
-      Changed |= CC->insertWait(MI, ScopeForFence, OrderingAddrSpace,
+      Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
                                 SIMemOp::LOAD | SIMemOp::STORE,
                                 MOI.getIsCrossAddressSpaceOrdering(),
                                 Position::BEFORE, Order, /*AtomicsOnly=*/true);
@@ -2365,7 +2371,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
       /// generate a fence. Could add support in this file for
       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
       /// adding S_WAITCNT before a S_BARRIER.
-      Changed |= CC->insertRelease(MI, ScopeForFence, OrderingAddrSpace,
+      Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
                                    MOI.getIsCrossAddressSpaceOrdering(),
                                    Position::BEFORE);
 
@@ -2377,7 +2383,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
     if (Order == AtomicOrdering::Acquire ||
         Order == AtomicOrdering::AcquireRelease ||
         Order == AtomicOrdering::SequentiallyConsistent)
-        Changed |= CC->insertAcquire(MI, ScopeForFence, OrderingAddrSpace,
+        Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
                                    Position::BEFORE);
 
     return Changed;
@@ -2502,7 +2508,7 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
         else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
           Changed |= expandLDSDMA(*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
-          Changed |= expandAtomicFence(ST,*MOI, MI);
+          Changed |= expandAtomicFence(*MOI, MI);
         else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
           Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
       }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 275825a973415..9d0d43d900026 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -5916,7 +5916,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn:
@@ -5925,7 +5924,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_smax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn:
@@ -5935,7 +5933,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -5948,7 +5945,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5960,7 +5956,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_smax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5973,7 +5968,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6002,16 +5996,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX1250-SDAG-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB58_5
 ; GFX1250-SDAG-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-SDAG-NEXT:  .LBB58_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6045,16 +6041,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX1250-GISEL-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB58_5
 ; GFX1250-GISEL-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB58_2
 ; GFX1250-GISEL-NEXT:  .LBB58_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6084,11 +6082,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX950-SDAG-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB58_5
 ; GFX950-SDAG-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6097,6 +6094,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
@@ -6128,11 +6126,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB58_4
 ; GFX950-GISEL-NEXT:  .LBB58_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB58_5
 ; GFX950-GISEL-NEXT:  .LBB58_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6141,6 +6138,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
@@ -6182,16 +6180,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX1250-SDAG-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB59_5
 ; GFX1250-SDAG-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-SDAG-NEXT:  .LBB59_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6228,16 +6228,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX1250-GISEL-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB59_5
 ; GFX1250-GISEL-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB59_2
 ; GFX1250-GISEL-NEXT:  .LBB59_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6270,11 +6272,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX950-SDAG-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB59_5
 ; GFX950-SDAG-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6283,6 +6284,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
@@ -6317,11 +6319,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB59_4
 ; GFX950-GISEL-NEXT:  .LBB59_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB59_5
 ; GFX950-GISEL-NEXT:  .LBB59_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6330,6 +6331,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
@@ -6372,7 +6374,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB60_2
@@ -6410,7 +6411,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB60_2
@@ -6443,7 +6443,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB60_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6480,7 +6479,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB60_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6528,7 +6526,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB61_2
@@ -6569,7 +6566,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB61_2
@@ -6605,7 +6601,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB61_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6646,7 +6641,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB61_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6752,7 +6746,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn:
@@ -6761,7 +6754,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_smin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn:
@@ -6771,7 +6763,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6784,7 +6775,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6796,7 +6786,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_smin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6809,7 +6798,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_smin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6838,16 +6826,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX1250-SDAG-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB66_5
 ; GFX1250-SDAG-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB66_2
 ; GFX1250-SDAG-NEXT:  .LBB66_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6881,16 +6871,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX1250-GISEL-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB66_5
 ; GFX1250-GISEL-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB66_2
 ; GFX1250-GISEL-NEXT:  .LBB66_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6920,11 +6912,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX950-SDAG-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB66_5
 ; GFX950-SDAG-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6933,6 +6924,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
@@ -6964,11 +6956,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB66_4
 ; GFX950-GISEL-NEXT:  .LBB66_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB66_5
 ; GFX950-GISEL-NEXT:  .LBB66_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6977,6 +6968,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
@@ -7018,16 +7010,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX1250-SDAG-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB67_5
 ; GFX1250-SDAG-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB67_2
 ; GFX1250-SDAG-NEXT:  .LBB67_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7064,16 +7058,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX1250-GISEL-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB67_5
 ; GFX1250-GISEL-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB67_2
 ; GFX1250-GISEL-NEXT:  .LBB67_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7106,11 +7102,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX950-SDAG-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB67_5
 ; GFX950-SDAG-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7119,6 +7114,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
@@ -7153,11 +7149,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB67_4
 ; GFX950-GISEL-NEXT:  .LBB67_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB67_5
 ; GFX950-GISEL-NEXT:  .LBB67_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7166,6 +7161,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
@@ -7208,7 +7204,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB68_2
@@ -7246,7 +7241,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB68_2
@@ -7279,7 +7273,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB68_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7316,7 +7309,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB68_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7364,7 +7356,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB69_2
@@ -7405,7 +7396,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB69_2
@@ -7441,7 +7431,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB69_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7482,7 +7471,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB69_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_smin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7588,7 +7576,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn:
@@ -7597,7 +7584,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_umax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn:
@@ -7607,7 +7593,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -7620,7 +7605,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7632,7 +7616,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_umax v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7645,7 +7628,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umax v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -7674,16 +7656,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX1250-SDAG-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB74_5
 ; GFX1250-SDAG-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB74_2
 ; GFX1250-SDAG-NEXT:  .LBB74_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7717,16 +7701,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX1250-GISEL-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB74_5
 ; GFX1250-GISEL-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB74_2
 ; GFX1250-GISEL-NEXT:  .LBB74_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7756,11 +7742,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX950-SDAG-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB74_5
 ; GFX950-SDAG-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7769,6 +7754,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
@@ -7800,11 +7786,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB74_4
 ; GFX950-GISEL-NEXT:  .LBB74_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB74_5
 ; GFX950-GISEL-NEXT:  .LBB74_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7813,6 +7798,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
@@ -7854,16 +7840,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX1250-SDAG-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB75_5
 ; GFX1250-SDAG-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB75_2
 ; GFX1250-SDAG-NEXT:  .LBB75_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7900,16 +7888,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX1250-GISEL-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB75_5
 ; GFX1250-GISEL-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB75_2
 ; GFX1250-GISEL-NEXT:  .LBB75_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7942,11 +7932,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX950-SDAG-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB75_5
 ; GFX950-SDAG-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7955,6 +7944,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
@@ -7989,11 +7979,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB75_4
 ; GFX950-GISEL-NEXT:  .LBB75_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB75_5
 ; GFX950-GISEL-NEXT:  .LBB75_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8002,6 +7991,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8044,7 +8034,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB76_2
@@ -8082,7 +8071,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB76_2
@@ -8115,7 +8103,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB76_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8152,7 +8139,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB76_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8200,7 +8186,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB77_2
@@ -8241,7 +8226,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB77_2
@@ -8277,7 +8261,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB77_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8318,7 +8301,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB77_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8424,7 +8406,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v1, s[2:3]
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn:
@@ -8433,7 +8414,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX950-SDAG-NEXT:    v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
 ; GFX950-SDAG-NEXT:    flat_atomic_umin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn:
@@ -8443,7 +8423,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -8456,7 +8435,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8468,7 +8446,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX950-SDAG-NEXT:    flat_atomic_umin v[0:1], v2
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8481,7 +8458,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX950-GISEL-NEXT:    flat_atomic_umin v[2:3], v1
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -8510,16 +8486,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX1250-SDAG-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB82_5
 ; GFX1250-SDAG-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB82_2
 ; GFX1250-SDAG-NEXT:  .LBB82_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8553,16 +8531,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX1250-GISEL-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB82_5
 ; GFX1250-GISEL-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB82_2
 ; GFX1250-GISEL-NEXT:  .LBB82_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8592,11 +8572,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX950-SDAG-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB82_5
 ; GFX950-SDAG-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8605,6 +8584,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
@@ -8636,11 +8616,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB82_4
 ; GFX950-GISEL-NEXT:  .LBB82_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB82_5
 ; GFX950-GISEL-NEXT:  .LBB82_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8649,6 +8628,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8690,16 +8670,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX1250-SDAG-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_branch .LBB83_5
 ; GFX1250-SDAG-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB83_2
 ; GFX1250-SDAG-NEXT:  .LBB83_4: ; %atomicrmw.private
 ; GFX1250-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8736,16 +8718,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX1250-GISEL-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_branch .LBB83_5
 ; GFX1250-GISEL-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr6
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB83_2
 ; GFX1250-GISEL-NEXT:  .LBB83_4: ; %atomicrmw.private
 ; GFX1250-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8778,11 +8762,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX950-SDAG-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX950-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_branch .LBB83_5
 ; GFX950-SDAG-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8791,6 +8774,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-SDAG-NEXT:    v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
@@ -8825,11 +8809,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    s_cbranch_execnz .LBB83_4
 ; GFX950-GISEL-NEXT:  .LBB83_2: ; %atomicrmw.phi
 ; GFX950-GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_branch .LBB83_5
 ; GFX950-GISEL-NEXT:  .LBB83_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8838,6 +8821,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX950-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX950-GISEL-NEXT:    s_nop 1
 ; GFX950-GISEL-NEXT:    v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8880,7 +8864,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB84_2
@@ -8918,7 +8901,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB84_2
@@ -8951,7 +8933,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB84_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8988,7 +8969,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB84_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -9036,7 +9016,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-SDAG-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3]
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1250-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-SDAG-NEXT:    s_cbranch_execz .LBB85_2
@@ -9077,7 +9056,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX1250-GISEL-NEXT:    flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr2
 ; GFX1250-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX1250-GISEL-NEXT:    s_cbranch_execz .LBB85_2
@@ -9113,7 +9091,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-SDAG-NEXT:    s_endpgm
 ; GFX950-SDAG-NEXT:  .LBB85_3: ; %atomicrmw.global
 ; GFX950-SDAG-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX950-SDAG-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -9154,7 +9131,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX950-GISEL-NEXT:  .LBB85_3: ; %atomicrmw.global
 ; GFX950-GISEL-NEXT:    flat_atomic_umin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX950-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX950-GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 58f7c4340276d..466d7152a0d84 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -2142,31 +2142,22 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2176,31 +2167,22 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2211,30 +2193,19 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2243,30 +2214,19 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2276,31 +2236,22 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2310,31 +2261,22 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
 }
 
 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_max_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2345,30 +2287,19 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
 }
 
 define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2377,30 +2308,19 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_max_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_max_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2414,31 +2334,22 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2448,31 +2359,22 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2483,30 +2385,19 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2515,30 +2406,19 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2548,31 +2428,22 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2582,31 +2453,22 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
 }
 
 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_min_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2617,30 +2479,19 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
 }
 
 define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2649,30 +2500,19 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_min_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_min_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2686,31 +2526,22 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2720,31 +2551,22 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2755,30 +2577,19 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2787,30 +2598,19 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2820,31 +2620,22 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2854,31 +2645,22 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2889,30 +2671,19 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
 }
 
 define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2921,30 +2692,19 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umax_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2958,31 +2718,22 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
 ; --------------------------------------------------------------------------------
 
 define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i32_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i32_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2992,31 +2743,22 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 }
 
 define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i32_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3027,30 +2769,19 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v1, s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i32_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v1, s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i32_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v1, s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3059,30 +2790,19 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i32_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin v0, v1, s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u32 v0, v1, s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3092,31 +2812,22 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
 }
 
 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_rtn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i64_rtn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i64_rtn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_rtn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_rtn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3126,31 +2837,22 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
 }
 
 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i64_rtn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3161,30 +2863,19 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
 }
 
 define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_nortn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i64_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i64_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_nortn:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_nortn:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3]
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3193,30 +2884,19 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
 }
 
 define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT:    s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
+; GCN-LABEL: global_umin_saddr_i64_nortn_neg128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
 ; GFX12-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
deleted file mode 100644
index 151d326d5cb73..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE64
-
-define amdgpu_kernel void @single_wave_workgroup32() #0 {
-  ; CHECK-LABEL: name: single_wave_workgroup32
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   S_ENDPGM 0
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @single_wave_workgroup64() #1 {
-  ; WAVE32-LABEL: name: single_wave_workgroup64
-  ; WAVE32: bb.0 (%ir-block.0):
-  ; WAVE32-NEXT:   S_WAIT_BVHCNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_SAMPLECNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_LOADCNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_STORECNT_soft 0
-  ; WAVE32-NEXT:   S_WAIT_DSCNT_soft 0
-  ; WAVE32-NEXT:   GLOBAL_INV 8, implicit $exec
-  ; WAVE32-NEXT:   S_ENDPGM 0
-  ;
-  ; WAVE64-LABEL: name: single_wave_workgroup64
-  ; WAVE64: bb.0 (%ir-block.0):
-  ; WAVE64-NEXT:   S_ENDPGM 0
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-define amdgpu_kernel void @multi_wave_workgroup() #2 {
-  ; CHECK-LABEL: name: multi_wave_workgroup
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   S_WAIT_BVHCNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_SAMPLECNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_LOADCNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_STORECNT_soft 0
-  ; CHECK-NEXT:   S_WAIT_DSCNT_soft 0
-  ; CHECK-NEXT:   GLOBAL_INV 8, implicit $exec
-  ; CHECK-NEXT:   S_ENDPGM 0
-  fence syncscope("workgroup") acq_rel
-  ret void
-}
-
-attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
-attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
-attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
-
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
new file mode 100644
index 0000000000000..aaa295992c361
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
@@ -0,0 +1,2759 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX1250
+
+
+define amdgpu_kernel void @wg_fence_acq_rel_single32() #0 {
+  ; GFX9-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acq_rel_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acq_rel_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_acq_rel_multi() #2 {
+  ; GFX9-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acq_rel_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_acquire_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") acquire
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_release_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_release_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_release_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_release_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_release_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_release_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_release_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_release_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") release
+  ret void
+}
+
+define amdgpu_kernel void @wg_fence_seq_cst_single64() #1 {
+  ; GFX9-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_fence_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  fence syncscope("workgroup") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_single32(ptr addrspace(1) %p) #0 {
+  ; GFX9-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_single64(ptr addrspace(1) %p) #1 {
+  ; GFX9-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 16240
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_ld_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
+; NOTE(review): #2 presumably permits workgroups larger than one wave — confirm
+; against the attribute definition elsewhere in this file. The checks below show
+; the workgroup seq_cst load keeps full workgroup synchronization on every
+; target: pre-load waitcnts on all of them, plus the acquire-side cache
+; invalidate (BUFFER_GL0_INV on GFX10, GLOBAL_INV on GFX12) where emitted.
  ; GFX9-LABEL: name: wg_ld_seq_cst_multi
  ; GFX9: bb.0 (%ir-block.0):
  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX9-NEXT: {{  $}}
  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX9-NEXT:   S_WAITCNT_soft 49279
  ; GFX9-NEXT:   S_WAITCNT_lds_direct
  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
  ; GFX9-NEXT:   S_ENDPGM 0
  ;
  ; GFX942-LABEL: name: wg_ld_seq_cst_multi
  ; GFX942: bb.0 (%ir-block.0):
  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX942-NEXT: {{  $}}
  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX942-NEXT:   S_WAITCNT_soft 49279
  ; GFX942-NEXT:   S_WAITCNT_lds_direct
  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
  ; GFX942-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-LABEL: name: wg_ld_seq_cst_multi
  ; GFX10: bb.0 (%ir-block.0):
  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-NEXT: {{  $}}
  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-NEXT:   S_WAITCNT_soft 112
  ; GFX10-NEXT:   S_WAITCNT_lds_direct
  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
  ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
  ; GFX10-NEXT:   S_WAITCNT_soft 16240
  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
  ; GFX10-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-LABEL: name: wg_ld_seq_cst_multi
  ; GFX12: bb.0 (%ir-block.0):
  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-NEXT: {{  $}}
  ; GFX12-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-NEXT:   S_WAIT_BVHCNT_soft 0
  ; GFX12-NEXT:   S_WAIT_SAMPLECNT_soft 0
  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX12-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX12-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX12-NEXT:   GLOBAL_INV 8, implicit $exec
  ; GFX12-NEXT:   S_ENDPGM 0
  ;
  ; GFX1250-LABEL: name: wg_ld_seq_cst_multi
  ; GFX1250: bb.0 (%ir-block.0):
  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX1250-NEXT: {{  $}}
  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; The IR under test: a workgroup-scope seq_cst atomic load from global memory.
  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
  ret void
+}
+
+define amdgpu_kernel void @wg_ld_acquire_single64(ptr addrspace(1) %p) #1 {
+; NOTE(review): #1 presumably caps the flat workgroup size at 64 lanes — confirm
+; against the attribute definition elsewhere in this file. Per the checks below,
+; wave64 runs (GFX9, GFX942, GFX10-W64, GFX12-W64) emit a plain load with no
+; acquire invalidate (the workgroup fits one wave), while wave32 runs
+; (GFX10-W32, GFX12-W32) still emit the workgroup acquire sequence
+; (BUFFER_GL0_INV / GLOBAL_INV), since 64 lanes spans two wave32 waves.
  ; GFX9-LABEL: name: wg_ld_acquire_single64
  ; GFX9: bb.0 (%ir-block.0):
  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX9-NEXT: {{  $}}
  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX9-NEXT:   S_ENDPGM 0
  ;
  ; GFX942-LABEL: name: wg_ld_acquire_single64
  ; GFX942: bb.0 (%ir-block.0):
  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX942-NEXT: {{  $}}
  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX942-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-W32-LABEL: name: wg_ld_acquire_single64
  ; GFX10-W32: bb.0 (%ir-block.0):
  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-W32-NEXT: {{  $}}
  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX10-W32-NEXT:   S_WAITCNT_soft 16240
  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
  ; GFX10-W32-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-W64-LABEL: name: wg_ld_acquire_single64
  ; GFX10-W64: bb.0 (%ir-block.0):
  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-W64-NEXT: {{  $}}
  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX10-W64-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W32-LABEL: name: wg_ld_acquire_single64
  ; GFX12-W32: bb.0 (%ir-block.0):
  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W32-NEXT: {{  $}}
  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
  ; GFX12-W32-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W64-LABEL: name: wg_ld_acquire_single64
  ; GFX12-W64: bb.0 (%ir-block.0):
  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W64-NEXT: {{  $}}
  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX12-W64-NEXT:   S_ENDPGM 0
  ;
  ; GFX1250-LABEL: name: wg_ld_acquire_single64
  ; GFX1250: bb.0 (%ir-block.0):
  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX1250-NEXT: {{  $}}
  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; The IR under test: a workgroup-scope acquire atomic load from global memory.
  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") acquire, align 4
  ret void
+}
+
+define amdgpu_kernel void @wg_ld_monotonic_single64(ptr addrspace(1) %p) #1 {
+; NOTE(review): #1 presumably caps the flat workgroup size at 64 lanes — confirm
+; against the attribute definition elsewhere in this file. A monotonic load
+; needs no waitcnts or cache invalidates on any target checked below; the
+; per-target differences are limited to the load's cache-policy operand
+; (e.g. `0, 1` on GFX942/GFX10 vs `0, 8` on GFX12 vs `0, 0` elsewhere).
  ; GFX9-LABEL: name: wg_ld_monotonic_single64
  ; GFX9: bb.0 (%ir-block.0):
  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX9-NEXT: {{  $}}
  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX9-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
  ; GFX9-NEXT:   S_ENDPGM 0
  ;
  ; GFX942-LABEL: name: wg_ld_monotonic_single64
  ; GFX942: bb.0 (%ir-block.0):
  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX942-NEXT: {{  $}}
  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX942-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
  ; GFX942-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-LABEL: name: wg_ld_monotonic_single64
  ; GFX10: bb.0 (%ir-block.0):
  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-NEXT: {{  $}}
  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
  ; GFX10-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-LABEL: name: wg_ld_monotonic_single64
  ; GFX12: bb.0 (%ir-block.0):
  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-NEXT: {{  $}}
  ; GFX12-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
  ; GFX12-NEXT:   S_ENDPGM 0
  ;
  ; GFX1250-LABEL: name: wg_ld_monotonic_single64
  ; GFX1250: bb.0 (%ir-block.0):
  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX1250-NEXT: {{  $}}
  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX1250-NEXT:   dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; The IR under test: a workgroup-scope monotonic atomic load from global memory.
  %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4
  ret void
+}
+
+define amdgpu_kernel void @wg_st_seq_cst_single32(ptr addrspace(1) %p, i32 %x) #0 {
+; NOTE(review): #0 presumably caps the flat workgroup size at 32 lanes — confirm
+; against the attribute definition elsewhere in this file. With at most 32
+; lanes the workgroup fits a single wave on every target checked below, so the
+; workgroup seq_cst store is emitted as a plain store: no release-side waitcnts
+; appear before GLOBAL_STORE_DWORD_SADDR on any run (the GFX1250
+; S_WAIT_XCNT_soft is unrelated to the atomic ordering checks here — verify).
  ; GFX9-LABEL: name: wg_st_seq_cst_single32
  ; GFX9: bb.0 (%ir-block.0):
  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX9-NEXT: {{  $}}
  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX9-NEXT:   S_ENDPGM 0
  ;
  ; GFX942-LABEL: name: wg_st_seq_cst_single32
  ; GFX942: bb.0 (%ir-block.0):
  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX942-NEXT: {{  $}}
  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX942-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-LABEL: name: wg_st_seq_cst_single32
  ; GFX10: bb.0 (%ir-block.0):
  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-NEXT: {{  $}}
  ; GFX10-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX10-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX10-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W32-LABEL: name: wg_st_seq_cst_single32
  ; GFX12-W32: bb.0 (%ir-block.0):
  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W32-NEXT: {{  $}}
  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX12-W32-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W64-LABEL: name: wg_st_seq_cst_single32
  ; GFX12-W64: bb.0 (%ir-block.0):
  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W64-NEXT: {{  $}}
  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX12-W64-NEXT:   S_ENDPGM 0
  ;
  ; GFX1250-LABEL: name: wg_st_seq_cst_single32
  ; GFX1250: bb.0 (%ir-block.0):
  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX1250-NEXT: {{  $}}
  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; The IR under test: a workgroup-scope seq_cst atomic store to global memory.
  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
  ret void
+}
+
+define amdgpu_kernel void @wg_st_seq_cst_single64(ptr addrspace(1) %p, i32 %x) #1 {
+; NOTE(review): #1 presumably caps the flat workgroup size at 64 lanes — confirm
+; against the attribute definition elsewhere in this file. Per the checks below,
+; wave64 runs (GFX9, GFX942, GFX10-W64, GFX12-W64) emit a plain store with no
+; release waitcnts, while wave32 runs (GFX10-W32, GFX12-W32, GFX1250) keep the
+; workgroup release waitcnt sequence before the store, since 64 lanes spans
+; two wave32 waves.
  ; GFX9-LABEL: name: wg_st_seq_cst_single64
  ; GFX9: bb.0 (%ir-block.0):
  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX9-NEXT: {{  $}}
  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX9-NEXT:   S_ENDPGM 0
  ;
  ; GFX942-LABEL: name: wg_st_seq_cst_single64
  ; GFX942: bb.0 (%ir-block.0):
  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX942-NEXT: {{  $}}
  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX942-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-W32-LABEL: name: wg_st_seq_cst_single64
  ; GFX10-W32: bb.0 (%ir-block.0):
  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-W32-NEXT: {{  $}}
  ; GFX10-W32-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
  ; GFX10-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX10-W32-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-W64-LABEL: name: wg_st_seq_cst_single64
  ; GFX10-W64: bb.0 (%ir-block.0):
  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-W64-NEXT: {{  $}}
  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX10-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX10-W64-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W32-LABEL: name: wg_st_seq_cst_single64
  ; GFX12-W32: bb.0 (%ir-block.0):
  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W32-NEXT: {{  $}}
  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX12-W32-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W64-LABEL: name: wg_st_seq_cst_single64
  ; GFX12-W64: bb.0 (%ir-block.0):
  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W64-NEXT: {{  $}}
  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX12-W64-NEXT:   S_ENDPGM 0
  ;
  ; GFX1250-LABEL: name: wg_st_seq_cst_single64
  ; GFX1250: bb.0 (%ir-block.0):
  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX1250-NEXT: {{  $}}
  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; The IR under test: a workgroup-scope seq_cst atomic store to global memory.
  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
  ret void
+}
+
+define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
+; NOTE(review): #2 presumably permits workgroups larger than one wave — confirm
+; against the attribute definition elsewhere in this file. Every run below
+; keeps the workgroup release waitcnt sequence before the seq_cst store
+; (S_WAITCNT_soft / S_WAIT_*CNT_soft), i.e. no wavefront-scope relaxation.
  ; GFX9-LABEL: name: wg_st_seq_cst_multi
  ; GFX9: bb.0 (%ir-block.0):
  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX9-NEXT: {{  $}}
  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX9-NEXT:   S_WAITCNT_soft 49279
  ; GFX9-NEXT:   S_WAITCNT_lds_direct
  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX9-NEXT:   S_ENDPGM 0
  ;
  ; GFX942-LABEL: name: wg_st_seq_cst_multi
  ; GFX942: bb.0 (%ir-block.0):
  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX942-NEXT: {{  $}}
  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX942-NEXT:   S_WAITCNT_soft 49279
  ; GFX942-NEXT:   S_WAITCNT_lds_direct
  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 1, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX942-NEXT:   S_ENDPGM 0
  ;
  ; GFX10-LABEL: name: wg_st_seq_cst_multi
  ; GFX10: bb.0 (%ir-block.0):
  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX10-NEXT: {{  $}}
  ; GFX10-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX10-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX10-NEXT:   S_WAITCNT_soft 112
  ; GFX10-NEXT:   S_WAITCNT_lds_direct
  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
  ; GFX10-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX10-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W32-LABEL: name: wg_st_seq_cst_multi
  ; GFX12-W32: bb.0 (%ir-block.0):
  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W32-NEXT: {{  $}}
  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX12-W32-NEXT:   S_ENDPGM 0
  ;
  ; GFX12-W64-LABEL: name: wg_st_seq_cst_multi
  ; GFX12-W64: bb.0 (%ir-block.0):
  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX12-W64-NEXT: {{  $}}
  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX12-W64-NEXT:   S_ENDPGM 0
  ;
  ; GFX1250-LABEL: name: wg_st_seq_cst_multi
  ; GFX1250: bb.0 (%ir-block.0):
  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
  ; GFX1250-NEXT: {{  $}}
  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
  ; GFX1250-NEXT:   S_ENDPGM 0
+  ; The IR under test: a workgroup-scope seq_cst atomic store to global memory.
  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
  ret void
+}
+
+; NOTE(review); auto-generated checks (update_mir_test_checks.py style) -- regenerate
+; rather than hand-editing the MIR lines below.
+; Release-ordered workgroup-scope global store. Attribute #1 presumably pins the flat
+; workgroup size to 64 lanes -- TODO confirm against the attribute list (not in this chunk).
+; Grounded in the checks below; on wave64 runs (GFX9, GFX942, the -W64 configurations)
+; the store is emitted with no preceding soft wait instructions, i.e. the workgroup
+; release was relaxed to wavefront scope, while the wave32 runs (which need two waves
+; for 64 lanes) still emit the soft waitcnt sequence before the store.
+define amdgpu_kernel void @wg_st_release_single64(ptr addrspace(1) %p, i32 %x) #1 {
+  ; GFX9-LABEL: name: wg_st_release_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_st_release_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_st_release_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_st_release_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_st_release_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_st_release_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_st_release_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") release, align 4
+  ret void
+}
+
+; NOTE(review); auto-generated checks -- regenerate rather than hand-editing.
+; atomicrmw add whose result %r is unused; the AMDGPU atomic optimizer rewrites it
+; into a single-lane update, visible below as the mbcnt / exec-mask compare / branch
+; sequence where one lane performs popcount(exec) * 7 as a single scaled add.
+; Attribute #0 presumably bounds the workgroup to at most 32 lanes -- TODO confirm
+; against the attribute list (not in this chunk); that would fit one wave on both
+; wave32 and wave64 configurations, which matches what the checks show; no run below
+; emits soft wait instructions for the seq_cst workgroup ordering before the atomic
+; (the single XCNT soft wait on the GFX1250 run looks unrelated to the fence --
+; verify against the GFX1250 memory-legalizer rules).
+define amdgpu_kernel void @wg_rmw_add_seq_cst_single32(ptr addrspace(1) %p) #0 {
+  ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_rmw_add_seq_cst_single64(ptr addrspace(1) %p) #1 {
+  ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
+  ; GFX9-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W64-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W64-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W64-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_rmw_xchg_acq_rel_single64(ptr addrspace(1) %p, i32 %x) #1 {
+  ; GFX9-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_rmw_xchg_acq_rel_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw xchg ptr addrspace(1) %p, i32 %x syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p, i32 %cmp, i32 %new) #2 {
+  ; GFX9-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") seq_cst seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_acquire_acquire_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acquire acquire
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_single32(ptr addrspace(3) %p) #0 {
+  ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single32
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_single64(ptr addrspace(3) %p) #1 {
+  ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
+  ; GFX9-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX10-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX12-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_st_release_single64(ptr addrspace(3) %p, i32 %x) #1 {
+  ; GFX9-LABEL: name: lds_wg_st_release_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_st_release_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_st_release_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_st_release_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_st_release_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_st_release_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_st_release_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(3) %p syncscope("workgroup") release, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_rmw_add_acq_rel_single64(ptr addrspace(3) %p) #1 {
+  ; GFX9-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX9-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX9-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX9-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX9-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.11):
+  ; GFX9-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX9-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX9-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.2 (%ir-block.16):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX942-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX942-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX942-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX942-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.1 (%ir-block.11):
+  ; GFX942-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX942-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX942-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT: bb.2 (%ir-block.16):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX10-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W32-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+  ; GFX10-W32-NEXT:   $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX10-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX10-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX10-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX10-W64-NEXT:   V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+  ; GFX10-W64-NEXT:   $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX10-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX10-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX12-W32-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W32-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+  ; GFX12-W32-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W32-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX12-W32-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX12-W32-NEXT:   $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   $sgpr0_sgpr1 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   $sgpr2_sgpr3 = S_MOV_B64 $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX12-W64-NEXT:   renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+  ; GFX12-W64-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+  ; GFX12-W64-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX12-W64-NEXT:   liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+  ; GFX12-W64-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   $sgpr0 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   $sgpr1 = S_MOV_B32 $exec_lo
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+  ; GFX1250-NEXT:   V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+  ; GFX1250-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.1 (%ir-block.7):
+  ; GFX1250-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX1250-NEXT:   liveins: $sgpr0, $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+  ; GFX1250-NEXT:   renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+  ; GFX1250-NEXT:   $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT: bb.2 (%ir-block.11):
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = atomicrmw add ptr addrspace(3) %p, i32 3 syncscope("workgroup") acq_rel
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-W32-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_unordered_single64(ptr addrspace(3) %p) #1 {
+  ; GFX9-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_ld_unordered_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") unordered, align 4
+  ret void
+}
+
+define amdgpu_kernel void @lds_wg_cmpxchg_monotonic_acquire_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
+  ; GFX9-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W32-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 49279
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-W64-NEXT:   DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") monotonic acquire
+  ret void
+}
+
+define amdgpu_kernel void @flat_wg_ld_acquire_single64(ptr addrspace(0) %p) #1 {
+  ; GFX9-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX9-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W32-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX10-W32: bb.0 (%ir-block.0):
+  ; GFX10-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W32-NEXT: {{  $}}
+  ; GFX10-W32-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W32-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX10-W32-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX10-W32-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 1, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX10-W32-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-W32-NEXT:   BUFFER_GL0_INV implicit $exec
+  ; GFX10-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-W64-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX10-W64: bb.0 (%ir-block.0):
+  ; GFX10-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-W64-NEXT: {{  $}}
+  ; GFX10-W64-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX10-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX10-W64-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX10-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W32-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 8, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   GLOBAL_INV 8, implicit $exec
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W64-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: flat_wg_ld_acquire_single64
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX1250-NEXT:   dead renamable $vgpr0 = FLAT_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  %v = load atomic i32, ptr addrspace(0) %p syncscope("workgroup") acquire, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x) #2 {
+  ; GFX9-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX9: bb.0 (%ir-block.0):
+  ; GFX9-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX9-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX9-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX9-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX9-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX9-NEXT:   S_WAITCNT_soft 49279
+  ; GFX9-NEXT:   S_WAITCNT_lds_direct
+  ; GFX9-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX9-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX942-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX942: bb.0 (%ir-block.0):
+  ; GFX942-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX942-NEXT: {{  $}}
+  ; GFX942-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX942-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX942-NEXT:   $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX942-NEXT:   S_WAITCNT_soft 49279
+  ; GFX942-NEXT:   S_WAITCNT_lds_direct
+  ; GFX942-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 1, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX942-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX10-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX10: bb.0 (%ir-block.0):
+  ; GFX10-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX10-NEXT:   renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+  ; GFX10-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX10-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX10-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX10-NEXT:   S_WAITCNT_soft 112
+  ; GFX10-NEXT:   S_WAITCNT_lds_direct
+  ; GFX10-NEXT:   S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+  ; GFX10-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W32-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX12-W32: bb.0 (%ir-block.0):
+  ; GFX12-W32-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W32-NEXT: {{  $}}
+  ; GFX12-W32-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W32-NEXT:   $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W32-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W32-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W32-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W32-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX12-W32-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-W64-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX12-W64: bb.0 (%ir-block.0):
+  ; GFX12-W64-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX12-W64-NEXT: {{  $}}
+  ; GFX12-W64-NEXT:   renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX12-W64-NEXT:   $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+  ; GFX12-W64-NEXT:   $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+  ; GFX12-W64-NEXT:   $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+  ; GFX12-W64-NEXT:   S_WAIT_BVHCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_SAMPLECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX12-W64-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX12-W64-NEXT:   FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX12-W64-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX1250-LABEL: name: flat_wg_st_seq_cst_multi
+  ; GFX1250: bb.0 (%ir-block.0):
+  ; GFX1250-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX1250-NEXT: {{  $}}
+  ; GFX1250-NEXT:   S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+  ; GFX1250-NEXT:   early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+  ; GFX1250-NEXT:   renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+  ; GFX1250-NEXT:   S_WAIT_LOADCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_STORECNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_DSCNT_soft 0
+  ; GFX1250-NEXT:   S_WAIT_XCNT_soft 0
+  ; GFX1250-NEXT:   FLAT_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+  ; GFX1250-NEXT:   S_ENDPGM 0
+  store atomic i32 %x, ptr addrspace(0) %p syncscope("workgroup") seq_cst, align 4
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }

>From ddc4ee20069982870c3ae0b7e30b62b82491f524 Mon Sep 17 00:00:00 2001
From: bmitic_amdeng <Barbara.Mitic at amd.com>
Date: Wed, 1 Apr 2026 16:33:56 +0200
Subject: [PATCH 4/4] [AMDGPU] Cache workgroup to wavefront demotion query in
 SIMemOpAccess

SIMemOpAccess now takes the IR Function for the current MachineFunction and
calls isSingleWavefrontWorkgroup once in its constructor. Pass the cached
boolean into SIMemOpInfo instead of re-querying subtarget state for every
memory op and fence.
---
 llvm/docs/AMDGPUUsage.rst                     |  1 -
 .../Target/AMDGPU/AMDGPULowerIntrinsics.cpp   |  6 ++--
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |  4 +++
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |  4 +++
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  | 28 ++++++++++---------
 5 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6d73a4b532c5e..4fca2a68aeeee 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -7153,7 +7153,6 @@ It applies to atomic ``load``, ``store``, ``atomicrmw``, and ``cmpxchg``
 instructions, and to ``fence`` instructions, when they use synchronizing memory
 orderings (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``).
 
-
 The memory model does not support the region address space which is treated as
 non-atomic.
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index d490788a97685..e089498693b2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -94,10 +94,8 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
   bool IsSingleWaveWG = false;
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
-    unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
-    IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
-  }
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
+    IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(*I.getFunction());
 
   IRBuilder<> B(&I);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1cd693f46500d..eefbd9e82a943 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -173,6 +173,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
   return Requested;
 }
 
+bool AMDGPUSubtarget::isSingleWavefrontWorkgroup(const Function &F) const {
+  return getFlatWorkGroupSizes(F).second <= getWavefrontSize();
+}
+
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
     std::pair<unsigned, unsigned> RequestedWavesPerEU,
     std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d23f94243a459..07746c087904d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -80,6 +80,10 @@ class AMDGPUSubtarget {
   /// be converted to integer, or violate subtarget's specifications.
   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
 
+  /// \returns true if the maximum flat work-group size for \p F is at most the
+  /// wavefront size, so every work-group fits in a single wavefront.
+  bool isSingleWavefrontWorkgroup(const Function &F) const;
+
   /// \returns The required size of workgroups that will be used to execute \p F
   /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
   /// metadata. Otherwise, returns std::nullopt.
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 53c10d6700c5a..0d67a187e10af 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -118,7 +118,7 @@ class SIMemOpInfo final {
       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
       bool IsVolatile = false, bool IsNonTemporal = false,
       bool IsLastUse = false, bool IsCooperative = false,
-      const Function *ScopeDemotionFn = nullptr)
+      bool CanDemoteWorkgroupToWavefront = false)
       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
@@ -171,14 +171,12 @@ class SIMemOpInfo final {
     // work-group fits in a single wave, so LLVM workgroup scope matches
     // wavefront scope. Demote workgroup → wavefront here for fences and for
     // atomics with ordering stronger than monotonic.
-    if (ScopeDemotionFn && this->Scope == SIAtomicScope::WORKGROUP &&
+    if (CanDemoteWorkgroupToWavefront &&
+        this->Scope == SIAtomicScope::WORKGROUP &&
         (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
          llvm::isStrongerThan(this->FailureOrdering,
-                              AtomicOrdering::Monotonic)) &&
-        ST.getFlatWorkGroupSizes(*ScopeDemotionFn).second <=
-            ST.getWavefrontSize()) {
+                              AtomicOrdering::Monotonic)))
       this->Scope = SIAtomicScope::WAVEFRONT;
-    }
   }
 
 public:
@@ -249,6 +247,7 @@ class SIMemOpAccess final {
 private:
   const AMDGPUMachineModuleInfo *MMI = nullptr;
   const GCNSubtarget &ST;
+  const bool CanDemoteWorkgroupToWavefront;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
   void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -272,7 +271,8 @@ class SIMemOpAccess final {
 public:
   /// Construct class to support accessing the machine memory operands
   /// of instructions in the machine function \p MF.
-  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
+  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST,
+                const Function &F);
 
   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
@@ -752,8 +752,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
 }
 
 SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
-                             const GCNSubtarget &ST)
-    : MMI(&MMI_), ST(ST) {}
+                             const GCNSubtarget &ST, const Function &F)
+    : MMI(&MMI_), ST(ST),
+      CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {}
 
 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) const {
@@ -824,7 +825,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                      IsNonTemporal, IsLastUse, IsCooperative,
-                     &MI->getMF()->getFunction());
+                     CanDemoteWorkgroupToWavefront);
 }
 
 std::optional<SIMemOpInfo>
@@ -894,7 +895,7 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
                      SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                      AtomicOrdering::NotAtomic, false, false, false, false,
-                     &MI->getMF()->getFunction());
+                     CanDemoteWorkgroupToWavefront);
 }
 
 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2383,7 +2384,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
     if (Order == AtomicOrdering::Acquire ||
         Order == AtomicOrdering::AcquireRelease ||
         Order == AtomicOrdering::SequentiallyConsistent)
-        Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
+      Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
                                    Position::BEFORE);
 
     return Changed;
@@ -2480,7 +2481,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
   bool Changed = false;
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
+  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST,
+                    MF.getFunction());
   CC = SICacheControl::create(ST);
 
   for (auto &MBB : MF) {



More information about the llvm-commits mailing list