[llvm] [AMDGPU] Adjust workgroup fence lowering for single-wave workgroups (PR #187673)
Barbara Mitic via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 30 07:50:11 PDT 2026
https://github.com/barbara-amd updated https://github.com/llvm/llvm-project/pull/187673
>From 481bd1a3cf986e93f5ebe01b4233687077ecce28 Mon Sep 17 00:00:00 2001
From: bmitic_amdeng <Barbara.Mitic at amd.com>
Date: Fri, 20 Mar 2026 11:32:37 +0100
Subject: [PATCH 1/3] [AMDGPU] Relax workgroup fences for single-wave
workgroups
When the whole workgroup fits in a single wavefront (i.e. at most one
wave), a fence syncscope("workgroup") can be lowered to
syncscope("wavefront"): there are no other waves in the workgroup for
that fence to synchronize across, so wavefront scope is sufficient.
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 20 +++++++++
.../codegen-prepare-relax-workgroup-fence.ll | 42 +++++++++++++++++++
2 files changed, 62 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8a80101d79a8e..380cf7cb56e0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -256,6 +256,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
+ bool visitFenceInst(FenceInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
@@ -1998,6 +1999,25 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
return true;
}
+// When all waves of the workgroup fit in one wave, workgroup fences can be
+// lowered to wavefront scope.
+bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
+ unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+ if (WGMaxSize > ST.getWavefrontSize())
+ return false;
+
+ SyncScope::ID WorkgroupSSID =
+ F.getContext().getOrInsertSyncScopeID("workgroup");
+ SyncScope::ID WavefrontSSID =
+ F.getContext().getOrInsertSyncScopeID("wavefront");
+
+ if (I.getSyncScopeID() != WorkgroupSSID)
+ return false;
+
+ I.setSyncScopeID(WavefrontSSID);
+ return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
Intrinsic::ID IID = I.getIntrinsicID();
switch (IID) {
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
new file mode 100644
index 0000000000000..b8aecf451e31c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: fence syncscope("wavefront") acq_rel
+; CHECK-NEXT: ret void
+;
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
+; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE32-NEXT: fence syncscope("workgroup") acq_rel
+; WAVE32-NEXT: ret void
+;
+; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
+; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; WAVE64-NEXT: fence syncscope("wavefront") acq_rel
+; WAVE64-NEXT: ret void
+;
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
+; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: fence syncscope("workgroup") acq_rel
+; CHECK-NEXT: ret void
+;
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
>From e74c69993cb6efed9f708582b3b13f82eca1ceb4 Mon Sep 17 00:00:00 2001
From: barbara-amd <Barbara.Mitic at amd.com>
Date: Tue, 24 Mar 2026 13:47:04 +0100
Subject: [PATCH 2/3] [AMDGPU] Refine single-wave workgroup fence lowering
Keep workgroup scope on the fence; when the workgroup is known to fit in a
single wave, use wavefront scope only for the wait sequence emitted by the
legalizer.
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 20 --------
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 22 ++++++---
.../codegen-prepare-relax-workgroup-fence.ll | 42 ----------------
...y-legalizer-single-wave-workgroup-fence.ll | 49 +++++++++++++++++++
4 files changed, 65 insertions(+), 68 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 380cf7cb56e0c..8a80101d79a8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -256,7 +256,6 @@ class AMDGPUCodeGenPrepareImpl
bool visitSelectInst(SelectInst &I);
bool visitPHINode(PHINode &I);
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
- bool visitFenceInst(FenceInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
@@ -1999,25 +1998,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
return true;
}
-// When all waves of the workgroup fit in one wave, workgroup fences can be
-// lowered to wavefront scope.
-bool AMDGPUCodeGenPrepareImpl::visitFenceInst(FenceInst &I) {
- unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
- if (WGMaxSize > ST.getWavefrontSize())
- return false;
-
- SyncScope::ID WorkgroupSSID =
- F.getContext().getOrInsertSyncScopeID("workgroup");
- SyncScope::ID WavefrontSSID =
- F.getContext().getOrInsertSyncScopeID("wavefront");
-
- if (I.getSyncScopeID() != WorkgroupSSID)
- return false;
-
- I.setSyncScopeID(WavefrontSSID);
- return true;
-}
-
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
Intrinsic::ID IID = I.getIntrinsicID();
switch (IID) {
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e5f352a3ed110..6b694291854fe 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -592,7 +592,7 @@ class SIMemoryLegalizer final {
MachineBasicBlock::iterator &MI);
/// Expands atomic fence operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicFence(const SIMemOpInfo &MOI,
+ bool expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
/// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
@@ -2326,7 +2326,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
return Changed;
}
-bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
+bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
@@ -2335,11 +2335,21 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
+ // When all waves of the workgroup fit in one wave, workgroup fences can be
+ // lowered to wavefront scope.
+ SIAtomicScope ScopeForFence = MOI.getScope();
+ if (ScopeForFence == SIAtomicScope::WORKGROUP) {
+ const Function &F = MI->getMF()->getFunction();
+ const unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
+ if (WGMaxSize <= ST.getWavefrontSize())
+ ScopeForFence = SIAtomicScope::WAVEFRONT;
+ }
+
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
if (Order == AtomicOrdering::Acquire) {
// Acquire fences only need to wait on the previous atomic they pair with.
- Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
+ Changed |= CC->insertWait(MI, ScopeForFence, OrderingAddrSpace,
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE, Order, /*AtomicsOnly=*/true);
@@ -2355,7 +2365,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
/// adding S_WAITCNT before a S_BARRIER.
- Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
+ Changed |= CC->insertRelease(MI, ScopeForFence, OrderingAddrSpace,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
@@ -2367,7 +2377,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::AcquireRelease ||
Order == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
+ Changed |= CC->insertAcquire(MI, ScopeForFence, OrderingAddrSpace,
Position::BEFORE);
return Changed;
@@ -2492,7 +2502,7 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
Changed |= expandLDSDMA(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
- Changed |= expandAtomicFence(*MOI, MI);
+ Changed |= expandAtomicFence(ST,*MOI, MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
}
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
deleted file mode 100644
index b8aecf451e31c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-relax-workgroup-fence.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize32 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE32
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes=amdgpu-codegenprepare -o - < %s | FileCheck %s --check-prefixes=CHECK,WAVE64
-
-define amdgpu_kernel void @single_wave_workgroup(ptr addrspace(1) %out) #0 {
-; CHECK-LABEL: define amdgpu_kernel void @single_wave_workgroup(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: fence syncscope("wavefront") acq_rel
-; CHECK-NEXT: ret void
-;
- fence syncscope("workgroup") acq_rel
- ret void
-}
-
-define amdgpu_kernel void @single_wave_workgroup64(ptr addrspace(1) %out) #1 {
-; WAVE32-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
-; WAVE32-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-; WAVE32-NEXT: fence syncscope("workgroup") acq_rel
-; WAVE32-NEXT: ret void
-;
-; WAVE64-LABEL: define amdgpu_kernel void @single_wave_workgroup64(
-; WAVE64-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
-; WAVE64-NEXT: fence syncscope("wavefront") acq_rel
-; WAVE64-NEXT: ret void
-;
- fence syncscope("workgroup") acq_rel
- ret void
-}
-
-define amdgpu_kernel void @multi_wave_workgroup(ptr addrspace(1) %out) #2 {
-; CHECK-LABEL: define amdgpu_kernel void @multi_wave_workgroup(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT: fence syncscope("workgroup") acq_rel
-; CHECK-NEXT: ret void
-;
- fence syncscope("workgroup") acq_rel
- ret void
-}
-
-attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
-attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
-attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
new file mode 100644
index 0000000000000..151d326d5cb73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE64
+
+define amdgpu_kernel void @single_wave_workgroup32() #0 {
+ ; CHECK-LABEL: name: single_wave_workgroup32
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @single_wave_workgroup64() #1 {
+ ; WAVE32-LABEL: name: single_wave_workgroup64
+ ; WAVE32: bb.0 (%ir-block.0):
+ ; WAVE32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; WAVE32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; WAVE32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; WAVE32-NEXT: S_WAIT_STORECNT_soft 0
+ ; WAVE32-NEXT: S_WAIT_DSCNT_soft 0
+ ; WAVE32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0
+ ;
+ ; WAVE64-LABEL: name: single_wave_workgroup64
+ ; WAVE64: bb.0 (%ir-block.0):
+ ; WAVE64-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @multi_wave_workgroup() #2 {
+ ; CHECK-LABEL: name: multi_wave_workgroup
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: S_WAIT_BVHCNT_soft 0
+ ; CHECK-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; CHECK-NEXT: S_WAIT_LOADCNT_soft 0
+ ; CHECK-NEXT: S_WAIT_STORECNT_soft 0
+ ; CHECK-NEXT: S_WAIT_DSCNT_soft 0
+ ; CHECK-NEXT: GLOBAL_INV 8, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
+
>From a21ebafea093a01457e48d710440e7ecf756afa6 Mon Sep 17 00:00:00 2001
From: barbara-amd <Barbara.Mitic at amd.com>
Date: Mon, 30 Mar 2026 16:47:27 +0200
Subject: [PATCH 3/3] [AMDGPU] Demote workgroup atomics to wavefront for
single-wave work-groups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When the kernel’s maximum flat work-group size is at most the wavefront
size, workgroup-scoped synchronization matches wavefront scope.
SIMemoryLegalizer applies that demotion in SIMemOpInfo for fences and for
atomic loads, stores, atomicrmw, and cmpxchg whose ordering is non-relaxed.
---
llvm/docs/AMDGPUUsage.rst | 13 +
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 44 +-
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 184 +-
.../CodeGen/AMDGPU/global-saddr-atomics.ll | 672 ++--
...y-legalizer-single-wave-workgroup-fence.ll | 49 -
...-legalizer-single-wave-workgroup-memops.ll | 2759 +++++++++++++++++
6 files changed, 3053 insertions(+), 668 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 1ede5ca2d4cf6..6d73a4b532c5e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -7141,6 +7141,19 @@ treated as non-atomic.
A memory synchronization scope wider than work-group is not meaningful for the
group (LDS) address space and is treated as work-group.
+When a work-group's maximum flat work-group size does not exceed the wavefront
+size, the work-group fits within a single wavefront. In this case, LLVM
+``workgroup`` synchronization scope is equivalent to ``wavefront`` scope.
+
+If the compiler can determine this bound (e.g., via ``amdgpu-flat-work-group-size``),
+the AMDGPU backend optimizes ``workgroup`` scope operations by lowering them to
+``wavefront``-scoped machine instructions.
+
+This lowering applies to atomic ``load``, ``store``, ``atomicrmw``, and ``cmpxchg``
+instructions, and to ``fence`` instructions, when they use synchronizing memory
+orderings (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``).
+
+
The memory model does not support the region address space which is treated as
non-atomic.
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6b694291854fe..53c10d6700c5a 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -117,7 +117,8 @@ class SIMemOpInfo final {
bool IsCrossAddressSpaceOrdering = true,
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
bool IsVolatile = false, bool IsNonTemporal = false,
- bool IsLastUse = false, bool IsCooperative = false)
+ bool IsLastUse = false, bool IsCooperative = false,
+ const Function *ScopeDemotionFn = nullptr)
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
@@ -165,6 +166,19 @@ class SIMemOpInfo final {
// AGENT scope as a conservatively correct alternative.
if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
this->Scope = SIAtomicScope::AGENT;
+
+ // When max flat work-group size is at most the wavefront size, the
+ // work-group fits in a single wave, so LLVM workgroup scope matches
+ // wavefront scope. Demote workgroup to wavefront here for fences and for
+ // atomics with ordering stronger than monotonic.
+ if (ScopeDemotionFn && this->Scope == SIAtomicScope::WORKGROUP &&
+ (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
+ llvm::isStrongerThan(this->FailureOrdering,
+ AtomicOrdering::Monotonic)) &&
+ ST.getFlatWorkGroupSizes(*ScopeDemotionFn).second <=
+ ST.getWavefrontSize()) {
+ this->Scope = SIAtomicScope::WAVEFRONT;
+ }
}
public:
@@ -592,7 +606,7 @@ class SIMemoryLegalizer final {
MachineBasicBlock::iterator &MI);
/// Expands atomic fence operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
- bool expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
+ bool expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
/// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
/// instructions are added/deleted or \p MI is modified, false otherwise.
@@ -809,7 +823,8 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
- IsNonTemporal, IsLastUse, IsCooperative);
+ IsNonTemporal, IsLastUse, IsCooperative,
+ &MI->getMF()->getFunction());
}
std::optional<SIMemOpInfo>
@@ -878,7 +893,8 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
- AtomicOrdering::NotAtomic);
+ AtomicOrdering::NotAtomic, false, false, false, false,
+ &MI->getMF()->getFunction());
}
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2326,7 +2342,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
return Changed;
}
-bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpInfo &MOI,
+bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) {
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
@@ -2335,21 +2351,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
- // When all waves of the workgroup fit in one wave, workgroup fences can be
- // lowered to wavefront scope.
- SIAtomicScope ScopeForFence = MOI.getScope();
- if (ScopeForFence == SIAtomicScope::WORKGROUP) {
- const Function &F = MI->getMF()->getFunction();
- const unsigned WGMaxSize = ST.getFlatWorkGroupSizes(F).second;
- if (WGMaxSize <= ST.getWavefrontSize())
- ScopeForFence = SIAtomicScope::WAVEFRONT;
- }
-
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
if (Order == AtomicOrdering::Acquire) {
// Acquire fences only need to wait on the previous atomic they pair with.
- Changed |= CC->insertWait(MI, ScopeForFence, OrderingAddrSpace,
+ Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE, Order, /*AtomicsOnly=*/true);
@@ -2365,7 +2371,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
/// generate a fence. Could add support in this file for
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
/// adding S_WAITCNT before a S_BARRIER.
- Changed |= CC->insertRelease(MI, ScopeForFence, OrderingAddrSpace,
+ Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
@@ -2377,7 +2383,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const GCNSubtarget &ST, const SIMemOpI
if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::AcquireRelease ||
Order == AtomicOrdering::SequentiallyConsistent)
- Changed |= CC->insertAcquire(MI, ScopeForFence, OrderingAddrSpace,
+ Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
Position::BEFORE);
return Changed;
@@ -2502,7 +2508,7 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
Changed |= expandLDSDMA(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
- Changed |= expandAtomicFence(ST,*MOI, MI);
+ Changed |= expandAtomicFence(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 275825a973415..9d0d43d900026 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -5916,7 +5916,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn:
@@ -5925,7 +5924,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn:
@@ -5935,7 +5933,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -5948,7 +5945,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5960,7 +5956,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5973,7 +5968,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6002,16 +5996,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6045,16 +6041,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6084,11 +6082,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_4
; GFX950-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB58_5
; GFX950-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6097,6 +6094,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
@@ -6128,11 +6126,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_4
; GFX950-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB58_5
; GFX950-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6141,6 +6138,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
@@ -6182,16 +6180,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6228,16 +6228,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6270,11 +6272,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_4
; GFX950-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB59_5
; GFX950-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6283,6 +6284,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
@@ -6317,11 +6319,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_4
; GFX950-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB59_5
; GFX950-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6330,6 +6331,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
@@ -6372,7 +6374,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
@@ -6410,7 +6411,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
@@ -6443,7 +6443,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6480,7 +6479,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6528,7 +6526,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
@@ -6569,7 +6566,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
@@ -6605,7 +6601,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6646,7 +6641,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6752,7 +6746,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn:
@@ -6761,7 +6754,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn:
@@ -6771,7 +6763,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6784,7 +6775,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6796,7 +6786,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6809,7 +6798,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -6838,16 +6826,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6881,16 +6871,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6920,11 +6912,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_4
; GFX950-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB66_5
; GFX950-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6933,6 +6924,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
@@ -6964,11 +6956,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_4
; GFX950-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB66_5
; GFX950-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -6977,6 +6968,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
@@ -7018,16 +7010,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7064,16 +7058,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7106,11 +7102,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_4
; GFX950-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB67_5
; GFX950-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7119,6 +7114,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
@@ -7153,11 +7149,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_4
; GFX950-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB67_5
; GFX950-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7166,6 +7161,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
@@ -7208,7 +7204,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
@@ -7246,7 +7241,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
@@ -7279,7 +7273,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7316,7 +7309,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7364,7 +7356,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
@@ -7405,7 +7396,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
@@ -7441,7 +7431,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7482,7 +7471,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7588,7 +7576,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn:
@@ -7597,7 +7584,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn:
@@ -7607,7 +7593,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -7620,7 +7605,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7632,7 +7616,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7645,7 +7628,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -7674,16 +7656,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7717,16 +7701,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7756,11 +7742,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_4
; GFX950-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB74_5
; GFX950-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7769,6 +7754,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
@@ -7800,11 +7786,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_4
; GFX950-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB74_5
; GFX950-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7813,6 +7798,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
@@ -7854,16 +7840,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7900,16 +7888,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7942,11 +7932,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_4
; GFX950-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB75_5
; GFX950-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -7955,6 +7944,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
@@ -7989,11 +7979,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_4
; GFX950-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB75_5
; GFX950-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8002,6 +7991,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8044,7 +8034,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
@@ -8082,7 +8071,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
@@ -8115,7 +8103,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8152,7 +8139,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8200,7 +8186,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
@@ -8241,7 +8226,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
@@ -8277,7 +8261,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8318,7 +8301,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8424,7 +8406,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn:
@@ -8433,7 +8414,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn:
@@ -8443,7 +8423,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -8456,7 +8435,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8468,7 +8446,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8481,7 +8458,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -8510,16 +8486,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8553,16 +8531,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8592,11 +8572,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_4
; GFX950-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB82_5
; GFX950-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8605,6 +8584,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
@@ -8636,11 +8616,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_4
; GFX950-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB82_5
; GFX950-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8649,6 +8628,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8690,16 +8670,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8736,16 +8718,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8778,11 +8762,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_4
; GFX950-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: s_branch .LBB83_5
; GFX950-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8791,6 +8774,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
@@ -8825,11 +8809,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_4
; GFX950-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_branch .LBB83_5
; GFX950-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8838,6 +8821,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
@@ -8880,7 +8864,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
@@ -8918,7 +8901,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
@@ -8951,7 +8933,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -8988,7 +8969,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -9036,7 +9016,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
@@ -9077,7 +9056,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
@@ -9113,7 +9091,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-SDAG-NEXT: s_endpgm
; GFX950-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
@@ -9154,7 +9131,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5]
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 58f7c4340276d..466d7152a0d84 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -2142,31 +2142,22 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; --------------------------------------------------------------------------------
define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i32_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i32_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_max_saddr_i32_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2176,31 +2167,22 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i32_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2211,30 +2193,19 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
}
define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i32_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_max_saddr_i32_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_max_saddr_i32_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2243,30 +2214,19 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_max_saddr_i32_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2276,31 +2236,22 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
}
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i64_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i64_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_max_saddr_i64_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2310,31 +2261,22 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
}
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_max_saddr_i64_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2345,30 +2287,19 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
}
define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i64_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_max_saddr_i64_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_max_saddr_i64_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2377,30 +2308,19 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_max_saddr_i64_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2414,31 +2334,22 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; --------------------------------------------------------------------------------
define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i32_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i32_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_min_saddr_i32_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2448,31 +2359,22 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i32_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2483,30 +2385,19 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
}
define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i32_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_min_saddr_i32_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_min_saddr_i32_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2515,30 +2406,19 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_min_saddr_i32_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2548,31 +2428,22 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
}
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i64_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i64_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_min_saddr_i64_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2582,31 +2453,22 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
}
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_min_saddr_i64_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2617,30 +2479,19 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
}
define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i64_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_min_saddr_i64_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_min_saddr_i64_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2649,30 +2500,19 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_min_saddr_i64_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2686,31 +2526,22 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; --------------------------------------------------------------------------------
define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i32_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i32_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umax_saddr_i32_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2720,31 +2551,22 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i32_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2755,30 +2577,19 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
}
define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i32_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umax_saddr_i32_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umax_saddr_i32_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2787,30 +2598,19 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
}
define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umax_saddr_i32_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2820,31 +2620,22 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
}
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i64_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i64_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umax_saddr_i64_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2854,31 +2645,22 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
}
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umax_saddr_i64_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2889,30 +2671,19 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
}
define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i64_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umax_saddr_i64_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umax_saddr_i64_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2921,30 +2692,19 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
}
define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umax_saddr_i64_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2958,31 +2718,22 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; --------------------------------------------------------------------------------
define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i32_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i32_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umin_saddr_i32_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -2992,31 +2743,22 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
}
define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i32_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3027,30 +2769,19 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
}
define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i32_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umin_saddr_i32_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i32_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3059,30 +2790,19 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
}
define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
-; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umin_saddr_i32_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3092,31 +2812,22 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
}
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_rtn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i64_rtn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i64_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umin_saddr_i64_rtn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3126,31 +2837,22 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
}
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: global_umin_saddr_i64_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3161,30 +2863,19 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
}
define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_nortn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i64_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umin_saddr_i64_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i64_nortn:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3]
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
@@ -3193,30 +2884,19 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
}
define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
-; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_umin_saddr_i64_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
deleted file mode 100644
index 151d326d5cb73..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-fence.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
-
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE32
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1201 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=CHECK,WAVE64
-
-define amdgpu_kernel void @single_wave_workgroup32() #0 {
- ; CHECK-LABEL: name: single_wave_workgroup32
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: S_ENDPGM 0
- fence syncscope("workgroup") acq_rel
- ret void
-}
-
-define amdgpu_kernel void @single_wave_workgroup64() #1 {
- ; WAVE32-LABEL: name: single_wave_workgroup64
- ; WAVE32: bb.0 (%ir-block.0):
- ; WAVE32-NEXT: S_WAIT_BVHCNT_soft 0
- ; WAVE32-NEXT: S_WAIT_SAMPLECNT_soft 0
- ; WAVE32-NEXT: S_WAIT_LOADCNT_soft 0
- ; WAVE32-NEXT: S_WAIT_STORECNT_soft 0
- ; WAVE32-NEXT: S_WAIT_DSCNT_soft 0
- ; WAVE32-NEXT: GLOBAL_INV 8, implicit $exec
- ; WAVE32-NEXT: S_ENDPGM 0
- ;
- ; WAVE64-LABEL: name: single_wave_workgroup64
- ; WAVE64: bb.0 (%ir-block.0):
- ; WAVE64-NEXT: S_ENDPGM 0
- fence syncscope("workgroup") acq_rel
- ret void
-}
-
-define amdgpu_kernel void @multi_wave_workgroup() #2 {
- ; CHECK-LABEL: name: multi_wave_workgroup
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: S_WAIT_BVHCNT_soft 0
- ; CHECK-NEXT: S_WAIT_SAMPLECNT_soft 0
- ; CHECK-NEXT: S_WAIT_LOADCNT_soft 0
- ; CHECK-NEXT: S_WAIT_STORECNT_soft 0
- ; CHECK-NEXT: S_WAIT_DSCNT_soft 0
- ; CHECK-NEXT: GLOBAL_INV 8, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0
- fence syncscope("workgroup") acq_rel
- ret void
-}
-
-attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
-attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
-attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
-
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
new file mode 100644
index 0000000000000..aaa295992c361
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll
@@ -0,0 +1,2759 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W32
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W64
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX1250
+
+
+define amdgpu_kernel void @wg_fence_acq_rel_single32() #0 {
+ ; GFX9-LABEL: name: wg_fence_acq_rel_single32
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_fence_acq_rel_single32
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_fence_acq_rel_single32
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: wg_fence_acq_rel_single32
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_fence_acq_rel_single32
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 {
+ ; GFX9-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_fence_acq_rel_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @wg_fence_acq_rel_multi() #2 {
+ ; GFX9-LABEL: name: wg_fence_acq_rel_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_fence_acq_rel_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_fence_acq_rel_multi
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: S_WAITCNT_soft 112
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: wg_fence_acq_rel_multi
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_fence_acq_rel_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @wg_fence_acquire_single64() #1 {
+ ; GFX9-LABEL: name: wg_fence_acquire_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_fence_acquire_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_fence_acquire_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_fence_acquire_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_fence_acquire_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_fence_acquire_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_fence_acquire_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") acquire
+ ret void
+}
+
+define amdgpu_kernel void @wg_fence_release_single64() #1 {
+ ; GFX9-LABEL: name: wg_fence_release_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_fence_release_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_fence_release_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_fence_release_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_fence_release_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_fence_release_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_fence_release_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") release
+ ret void
+}
+
+define amdgpu_kernel void @wg_fence_seq_cst_single64() #1 {
+ ; GFX9-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_fence_seq_cst_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ fence syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_single32(ptr addrspace(1) %p) #0 {
+ ; GFX9-LABEL: name: wg_ld_seq_cst_single32
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_ld_seq_cst_single32
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_ld_seq_cst_single32
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: wg_ld_seq_cst_single32
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_ld_seq_cst_single32
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_single64(ptr addrspace(1) %p) #1 {
+ ; GFX9-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 16240
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_ld_seq_cst_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 {
+ ; GFX9-LABEL: name: wg_ld_seq_cst_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_ld_seq_cst_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_ld_seq_cst_multi
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-NEXT: S_WAITCNT_soft 112
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-NEXT: S_WAITCNT_soft 16240
+ ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: wg_ld_seq_cst_multi
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_ld_seq_cst_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @wg_ld_acquire_single64(ptr addrspace(1) %p) #1 {
+ ; GFX9-LABEL: name: wg_ld_acquire_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_ld_acquire_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_ld_acquire_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 16240
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_ld_acquire_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_ld_acquire_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_ld_acquire_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_ld_acquire_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") acquire, align 4
+ ret void
+}
+
+; Workgroup-scope monotonic atomic load from global memory. The checks show no
+; soft waitcnt or cache-invalidate instructions on any target: monotonic
+; ordering carries no acquire/release semantics, so no fencing is required
+; regardless of workgroup size.
+; NOTE(review): attributes #1 presumably limits the flat workgroup size to a
+; single wave64 (per the *_single64 naming) — confirm against the attribute
+; definitions, which are outside this hunk.
+define amdgpu_kernel void @wg_ld_monotonic_single64(ptr addrspace(1) %p) #1 {
+ ; GFX9-LABEL: name: wg_ld_monotonic_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_ld_monotonic_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_ld_monotonic_single64
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: wg_ld_monotonic_single64
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_ld_monotonic_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4
+ ret void
+}
+
+; Workgroup-scope seq_cst atomic store. The checks show the release-side soft
+; waits before the store are omitted on GFX9/GFX942/GFX10/GFX12; GFX1250 keeps
+; only S_WAIT_XCNT_soft, which appears unrelated to the fence (presumably an
+; address/transfer-count dependency wait — confirm).
+; NOTE(review): attributes #0 presumably limits the flat workgroup size to at
+; most 32 lanes (per the *_single32 naming), so the workgroup fits in one wave
+; for both wave32 and wave64 — confirm against the attribute definitions,
+; which are outside this hunk.
+define amdgpu_kernel void @wg_st_seq_cst_single32(ptr addrspace(1) %p, i32 %x) #0 {
+ ; GFX9-LABEL: name: wg_st_seq_cst_single32
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_st_seq_cst_single32
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_st_seq_cst_single32
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_st_seq_cst_single32
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_st_seq_cst_single32
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_st_seq_cst_single32
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+; Workgroup-scope seq_cst atomic store with a 64-lane-sized workgroup
+; (attributes #1, presumably flat workgroup size <= 64 per the *_single64
+; naming — confirm against the attribute definitions outside this hunk).
+; The checks split by wave size: wave64 configurations (GFX9, GFX942,
+; GFX10-W64, GFX12-W64) emit no release waits before the store, while wave32
+; configurations (GFX10-W32, GFX12-W32, GFX1250) keep the soft waitcnt
+; sequence — in wave32 mode a 64-lane workgroup spans more than one wave, so
+; the workgroup fence cannot be relaxed.
+define amdgpu_kernel void @wg_st_seq_cst_single64(ptr addrspace(1) %p, i32 %x) #1 {
+ ; GFX9-LABEL: name: wg_st_seq_cst_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_st_seq_cst_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_st_seq_cst_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_st_seq_cst_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_st_seq_cst_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_st_seq_cst_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_st_seq_cst_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+; Negative test: workgroup-scope seq_cst store with a multi-wave workgroup
+; (attributes #2, presumably a flat workgroup size larger than one wave per
+; the *_multi naming — confirm against the attribute definitions outside this
+; hunk). Every target keeps the full release wait sequence before the store,
+; so the single-wave fence relaxation must not fire here.
+define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 {
+ ; GFX9-LABEL: name: wg_st_seq_cst_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_st_seq_cst_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 1, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_st_seq_cst_multi
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-NEXT: S_WAITCNT_soft 112
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_st_seq_cst_multi
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_st_seq_cst_multi
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_st_seq_cst_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+; Workgroup-scope release atomic store with a 64-lane-sized workgroup
+; (attributes #1, presumably flat workgroup size <= 64 per the *_single64
+; naming — confirm against the attribute definitions outside this hunk).
+; Same wave-size split as the seq_cst variant: wave64 configurations (GFX9,
+; GFX942, GFX10-W64, GFX12-W64) drop the release waits before the store, while
+; wave32 configurations (GFX10-W32, GFX12-W32, GFX1250) keep them because the
+; workgroup does not fit in a single wave32.
+define amdgpu_kernel void @wg_st_release_single64(ptr addrspace(1) %p, i32 %x) #1 {
+ ; GFX9-LABEL: name: wg_st_release_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_st_release_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_st_release_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_st_release_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_st_release_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_st_release_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_st_release_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") release, align 4
+ ret void
+}
+
+; Single-wave workgroup case: attribute #0 (defined outside this hunk —
+; presumably caps the flat workgroup size at 32; confirm at the attribute
+; definition). A workgroup of at most 32 lanes fits in one wave on every
+; checked target (wave32 and wave64 alike), so the workgroup-scope seq_cst
+; RMW needs no cross-wave ordering: note that none of the generated checks
+; below contain the workgroup release/acquire wait sequences or cache
+; invalidates (BUFFER_GL0_INV / GLOBAL_INV) around the atomic.
+; NOTE(review): the GFX1250 S_WAIT_XCNT_soft appears even in this relaxed
+; case, so it is presumably a hazard wait rather than fence lowering —
+; confirm against the memory legalizer.
+define amdgpu_kernel void @wg_rmw_add_seq_cst_single32(ptr addrspace(1) %p) #0 {
+ ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.1 (%ir-block.11):
+ ; GFX9-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.2 (%ir-block.16):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.1 (%ir-block.11):
+ ; GFX942-NEXT: successors: %bb.2(0x80000000)
+ ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.2 (%ir-block.16):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single32
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.1 (%ir-block.7):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2 (%ir-block.11):
+ ; GFX1250-NEXT: S_ENDPGM 0
+ ; IR under test; the checks above are autogenerated MIR (do not hand-edit).
+ %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Wave64-only single-wave case: attribute #1 (defined outside this hunk —
+; presumably caps the flat workgroup size at 64; confirm at the attribute
+; definition). 64 lanes fit in one wave only on wave64 targets, and the
+; generated checks reflect exactly that split:
+;  - wave64 (GFX9, GFX942, GFX10-W64, GFX12-W64): no fence wait/invalidate
+;    sequence around the atomic (workgroup scope relaxed away);
+;  - wave32 (GFX10-W32, GFX12-W32, GFX1250): the full workgroup
+;    release/acquire lowering is kept — waitcnt sequences before/after the
+;    atomic plus cache invalidation where the target emits one
+;    (BUFFER_GL0_INV on GFX10-W32, GLOBAL_INV on GFX12-W32).
+define amdgpu_kernel void @wg_rmw_add_seq_cst_single64(ptr addrspace(1) %p) #1 {
+ ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.1 (%ir-block.11):
+ ; GFX9-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.2 (%ir-block.16):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.1 (%ir-block.11):
+ ; GFX942-NEXT: successors: %bb.2(0x80000000)
+ ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.2 (%ir-block.16):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.1 (%ir-block.7):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2 (%ir-block.11):
+ ; GFX1250-NEXT: S_ENDPGM 0
+ ; IR under test; the checks above are autogenerated MIR (do not hand-edit).
+ %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 {
+ ; GFX9-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.1 (%ir-block.11):
+ ; GFX9-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.2 (%ir-block.16):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.1 (%ir-block.11):
+ ; GFX942-NEXT: successors: %bb.2(0x80000000)
+ ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.2 (%ir-block.16):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W64-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W64-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W64-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W64-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.1 (%ir-block.7):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2 (%ir-block.11):
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @wg_rmw_xchg_acq_rel_single64(ptr addrspace(1) %p, i32 %x) #1 {
+ ; GFX9-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_rmw_xchg_acq_rel_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = atomicrmw xchg ptr addrspace(1) %p, i32 %x syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
+ ; GFX9-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p, i32 %cmp, i32 %new) #2 {
+ ; GFX9-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-NEXT: S_WAITCNT_soft 112
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W64-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @wg_cmpxchg_acquire_acquire_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 {
+ ; GFX9-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: wg_cmpxchg_acquire_acquire_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_single32(ptr addrspace(3) %p) #0 {
+; COM: Workgroup-scope seq_cst LDS load. The "single32" suffix and attribute #0
+; COM: (defined outside this hunk) presumably cap the workgroup at 32 threads,
+; COM: i.e. one wave on every checked configuration -- TODO confirm #0 sets
+; COM: amdgpu-flat-work-group-size accordingly.
+; COM: All run lines check a bare DS_READ with no soft waitcnts and no cache
+; COM: invalidate, i.e. the workgroup fence was relaxed to wavefront scope.
+ ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single32
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single32
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: lds_wg_ld_seq_cst_single32
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: lds_wg_ld_seq_cst_single32
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX12-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single32
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_single64(ptr addrspace(3) %p) #1 {
+; COM: Same seq_cst LDS load, but attribute #1 (outside this hunk) presumably
+; COM: caps the workgroup at 64 threads -- TODO confirm.
+; COM: Wave64 configs (GFX9, GFX942, GFX10-W64, GFX12-W64) fit the workgroup in
+; COM: one wave and check a bare DS_READ with no waits/invalidates; wave32
+; COM: configs (GFX10-W32, GFX12-W32, GFX1250) need two waves, so they keep the
+; COM: full workgroup-scope soft waitcnts (and BUFFER_GL0_INV/GLOBAL_INV where
+; COM: the target emits one).
+ ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 49279
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 {
+; COM: Negative test: attribute #2 (outside this hunk) presumably permits a
+; COM: workgroup larger than one wave -- TODO confirm. Every run line still
+; COM: checks the full workgroup-scope soft waitcnts around the DS_READ (plus
+; COM: BUFFER_GL0_INV on GFX10 and GLOBAL_INV on GFX12), i.e. the fence must
+; COM: NOT be relaxed here.
+ ; GFX9-LABEL: name: lds_wg_ld_seq_cst_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_ld_seq_cst_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: lds_wg_ld_seq_cst_multi
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-NEXT: S_WAITCNT_soft 112
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX10-NEXT: S_WAITCNT_soft 49279
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: lds_wg_ld_seq_cst_multi
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX12-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX12-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3)
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_st_release_single64(ptr addrspace(3) %p, i32 %x) #1 {
+; COM: Workgroup-scope release LDS store with the 64-thread cap of attribute #1
+; COM: (defined outside this hunk -- TODO confirm). Wave64 configs (GFX9,
+; COM: GFX942, GFX10-W64, GFX12-W64) check a bare DS_WRITE with no preceding
+; COM: soft waitcnts (release relaxed to wavefront scope); wave32 configs
+; COM: (GFX10-W32, GFX12-W32, GFX1250) keep the release waits before the store.
+ ; GFX9-LABEL: name: lds_wg_st_release_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX9-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_st_release_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX942-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: lds_wg_st_release_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: lds_wg_st_release_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: lds_wg_st_release_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: lds_wg_st_release_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_st_release_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ store atomic i32 %x, ptr addrspace(3) %p syncscope("workgroup") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_rmw_add_acq_rel_single64(ptr addrspace(3) %p) #1 {
+; COM: Workgroup-scope acq_rel LDS atomicrmw add under the 64-thread cap of
+; COM: attribute #1 (defined outside this hunk -- TODO confirm). The V_MBCNT /
+; COM: S_BCNT1 / S_MUL-by-3 sequence in the checks looks like the AMDGPU atomic
+; COM: optimizer funneling the add through one lane -- verify against the pass
+; COM: that produced these checks. Wave64 configs check DS_ADD with no
+; COM: waits/invalidates; wave32 configs (GFX10-W32, GFX12-W32, GFX1250) keep
+; COM: the acq_rel wait sequence around the DS_ADD.
+ ; GFX9-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.1 (%ir-block.11):
+ ; GFX9-NEXT: successors: %bb.2(0x80000000)
+ ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: bb.2 (%ir-block.16):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.1 (%ir-block.11):
+ ; GFX942-NEXT: successors: %bb.2(0x80000000)
+ ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: bb.2 (%ir-block.16):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX10-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 49279
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX10-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.1 (%ir-block.7):
+ ; GFX12-W32-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX12-W32-NEXT: $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: bb.2 (%ir-block.11):
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec
+ ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.1 (%ir-block.11):
+ ; GFX12-W64-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: bb.2 (%ir-block.16):
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_rmw_add_acq_rel_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec
+ ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec
+ ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.1 (%ir-block.7):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4)
+ ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc
+ ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3
+ ; GFX1250-NEXT: $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3)
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2 (%ir-block.11):
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = atomicrmw add ptr addrspace(3) %p, i32 3 syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
+ ; GFX9-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-W32-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 49279
+ ; GFX10-W32-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3)
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_ld_unordered_single64(ptr addrspace(3) %p) #1 {
+ ; GFX9-LABEL: name: lds_wg_ld_unordered_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_ld_unordered_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: lds_wg_ld_unordered_single64
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: lds_wg_ld_unordered_single64
+ ; GFX12: bb.0 (%ir-block.0):
+ ; GFX12-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_ld_unordered_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_wg_cmpxchg_monotonic_acquire_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 {
+ ; GFX9-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W32-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 49279
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-W64-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3)
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @flat_wg_ld_acquire_single64(ptr addrspace(0) %p) #1 {
+ ; GFX9-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX9-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
+ ; GFX942-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W32-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX10-W32: bb.0 (%ir-block.0):
+ ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W32-NEXT: {{ $}}
+ ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX10-W32-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 1, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX10-W32-NEXT: S_WAITCNT_soft 112
+ ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec
+ ; GFX10-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-W64-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX10-W64: bb.0 (%ir-block.0):
+ ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-W64-NEXT: {{ $}}
+ ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX10-W64-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX10-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX12-W32-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 8, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX12-W64-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: flat_wg_ld_acquire_single64
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load)
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ %v = load atomic i32, ptr addrspace(0) %p syncscope("workgroup") acquire, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x) #2 {
+ ; GFX9-LABEL: name: flat_wg_st_seq_cst_multi
+ ; GFX9: bb.0 (%ir-block.0):
+ ; GFX9-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT_soft 49279
+ ; GFX9-NEXT: S_WAITCNT_lds_direct
+ ; GFX9-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+ ; GFX9-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: flat_wg_st_seq_cst_multi
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT_soft 49279
+ ; GFX942-NEXT: S_WAITCNT_lds_direct
+ ; GFX942-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 1, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX10-LABEL: name: flat_wg_st_seq_cst_multi
+ ; GFX10: bb.0 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
+ ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX10-NEXT: S_WAITCNT_soft 112
+ ; GFX10-NEXT: S_WAITCNT_lds_direct
+ ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
+ ; GFX10-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+ ; GFX10-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W32-LABEL: name: flat_wg_st_seq_cst_multi
+ ; GFX12-W32: bb.0 (%ir-block.0):
+ ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W32-NEXT: {{ $}}
+ ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W32-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+ ; GFX12-W32-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-W64-LABEL: name: flat_wg_st_seq_cst_multi
+ ; GFX12-W64: bb.0 (%ir-block.0):
+ ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX12-W64-NEXT: {{ $}}
+ ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+ ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+ ; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+ ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX12-W64-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+ ; GFX12-W64-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: flat_wg_st_seq_cst_multi
+ ; GFX1250: bb.0 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode
+ ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4)
+ ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec
+ ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0
+ ; GFX1250-NEXT: S_WAIT_XCNT_soft 0
+ ; GFX1250-NEXT: FLAT_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ store atomic i32 %x, ptr addrspace(0) %p syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }
More information about the llvm-commits
mailing list