[llvm] [RFC][AMDGPU] Add vulkan:private/nonprivate MMRAs support (PR #78573)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue May 28 23:28:24 PDT 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/78573
>From dd92e93623746de8465c7e65e0da9f15938ddc0b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 17 Jan 2024 11:28:54 +0100
Subject: [PATCH] [RFC][AMDGPU] Add `vulkan:private`/`nonprivate` MMRAs support
Allows Vulkan front-ends to properly implement the Vulkan memory model.
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 221 +-
.../memory-legalizer-mmra-vulkan-default.ll | 2550 +++++++++
...memory-legalizer-mmra-vulkan-nonprivate.ll | 4599 +++++++++++++++++
.../memory-legalizer-mmra-vulkan-private.ll | 414 ++
4 files changed, 7722 insertions(+), 62 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-default.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-nonprivate.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-private.ll
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 24f8788683ed7..919a2b9e0ec70 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -10,6 +10,20 @@
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
+///
+/// Notes on the Vulkan Memory Model, which is controlled by the
+/// `amdgpu.vulkan.memory-model` module flag. When that flag
+/// is provided:
+/// - vulkan:private operations cannot be atomic, and as such
+/// their codegen is not affected.
+/// - vulkan:nonprivate bypasses all caches not coherent for GFXIP.
+/// - vulkan:nonprivate does not need to use invalidates or writeback
+/// operations as we bypass non-coherent caches in the first place.
+/// - unannotated non-atomic loads/stores also bypass caches.
+///
+/// TODO: The above doc should simply be a separate section in the memory model
+/// documentation. It can take the form of a table noting the changes
+/// to each target.
//
//===----------------------------------------------------------------------===//
@@ -40,6 +54,13 @@ namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+/// Vulkan memory-model kind of an operation, as read from its MMRA tags.
+enum class VulkanOpKind {
+ None,       // No vulkan tag found, or the Vulkan memory model is disabled.
+ NonPrivate, // Tagged vulkan:nonprivate.
+ Private,    // Tagged vulkan:private.
+};
+
/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
NONE = 0u,
@@ -97,6 +118,7 @@ class SIMemOpInfo final {
SIAtomicScope Scope = SIAtomicScope::SYSTEM;
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
+ VulkanOpKind VKOK = VulkanOpKind::None;
bool IsCrossAddressSpaceOrdering = false;
bool IsVolatile = false;
bool IsNonTemporal = false;
@@ -109,11 +131,11 @@ class SIMemOpInfo final {
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
bool IsCrossAddressSpaceOrdering = true,
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
- bool IsVolatile = false, bool IsNonTemporal = false,
- bool IsLastUse = false)
+ VulkanOpKind VKOK = VulkanOpKind::None, bool IsVolatile = false,
+ bool IsNonTemporal = false, bool IsLastUse = false)
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
- IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+ VKOK(VKOK), IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
IsLastUse(IsLastUse) {
@@ -155,6 +177,9 @@ class SIMemOpInfo final {
}
public:
+ /// \returns the type of vulkan operation this is.
+ VulkanOpKind getVulkanOpKind() const { return VKOK; }
+
/// \returns Atomic synchronization scope of the machine instruction used to
/// create this SIMemOpInfo.
SIAtomicScope getScope() const {
@@ -218,6 +243,7 @@ class SIMemOpInfo final {
class SIMemOpAccess final {
private:
AMDGPUMachineModuleInfo *MMI = nullptr;
+ bool UsesVulkanMemoryModel = false;
/// Reports unsupported message \p Msg for \p MI to LLVM context.
void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -238,10 +264,12 @@ class SIMemOpAccess final {
std::optional<SIMemOpInfo>
constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
+ VulkanOpKind getVulkanOpKind(const MachineInstr &MI) const;
+
public:
/// Construct class to support accessing the machine memory operands
/// of instructions in the machine function \p MF.
- SIMemOpAccess(MachineFunction &MF);
+ SIMemOpAccess(MachineFunction &MF, bool UsesVulkanMemoryModel);
/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
@@ -285,10 +313,12 @@ class SICacheControl {
AMDGPU::CPol::CPol Bit) const;
public:
-
/// Create a cache control for the subtarget \p ST.
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
+ /// Whether this target supports the Vulkan memory model.
+ virtual bool supportsVulkanMM() const { return true; }
+
/// Update \p MI memory load instruction to bypass any caches up to
/// the \p Scope memory scope for address spaces \p
/// AddrSpace. Return true iff the instruction was modified.
@@ -343,9 +373,8 @@ class SICacheControl {
/// operations by any thread for memory scopes up to memory scope \p Scope .
/// Returns true iff any instructions inserted.
virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const = 0;
+ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace,
+ Position Pos, VulkanOpKind VKOK) const = 0;
/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure previous memory instructions by this thread
@@ -357,7 +386,7 @@ class SICacheControl {
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
- Position Pos) const = 0;
+ Position Pos, VulkanOpKind VKOK) const = 0;
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
@@ -411,16 +440,15 @@ class SIGfx6CacheControl : public SICacheControl {
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos,
+ VulkanOpKind VKOK) const override;
bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
+ Position Pos, VulkanOpKind VKOK) const override;
};
class SIGfx7CacheControl : public SIGfx6CacheControl {
@@ -428,11 +456,9 @@ class SIGfx7CacheControl : public SIGfx6CacheControl {
SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos,
+ VulkanOpKind VKOK) const override;
};
class SIGfx90ACacheControl : public SIGfx7CacheControl {
@@ -440,6 +466,9 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
+ // gfx90a is an accelerator, so the Vulkan memory model is not supported.
+ bool supportsVulkanMM() const override { return false; }
+
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const override;
@@ -464,16 +493,15 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos,
+ VulkanOpKind VKOK) const override;
bool insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
+ Position Pos, VulkanOpKind VKOK) const override;
};
class SIGfx940CacheControl : public SIGfx90ACacheControl {
@@ -519,11 +547,12 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl {
bool IsLastUse) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+ SIAtomicAddrSpace AddrSpace, Position Pos,
+ VulkanOpKind VKOK) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
- Position Pos) const override;
+ Position Pos, VulkanOpKind VKOK) const override;
bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI) const override {
@@ -569,10 +598,9 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos,
+ VulkanOpKind VKOK) const override;
};
class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -617,7 +645,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsCrossAddrSpaceOrdering, Position Pos) const override;
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+ SIAtomicAddrSpace AddrSpace, Position Pos,
+ VulkanOpKind VKOK) const override;
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -629,6 +658,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
+ bool UsesVulkanMemoryModel = false;
/// Cache Control.
std::unique_ptr<SICacheControl> CC = nullptr;
@@ -781,7 +811,8 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::OTHER;
}
-SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF, bool UsesVulkanMemoryModel)
+ : UsesVulkanMemoryModel(UsesVulkanMemoryModel) {
MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}
@@ -843,8 +874,24 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
}
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
- IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
- IsNonTemporal, IsLastUse);
+ IsCrossAddressSpaceOrdering, FailureOrdering,
+ getVulkanOpKind(*MI), IsVolatile, IsNonTemporal,
+ IsLastUse);
+}
+
+VulkanOpKind SIMemOpAccess::getVulkanOpKind(const MachineInstr &MI) const {
+ // Map the vulkan MMRA tag of MI to a VulkanOpKind; "private" is tested first.
+ // TODO: Warn if annotation found, but Vulkan MM not enabled?
+ if (UsesVulkanMemoryModel) {
+ if (auto MMRA = MMRAMetadata(MI.getMMRAMetadata())) {
+ if (MMRA.hasTag("vulkan", "private"))
+ return VulkanOpKind::Private;
+ else if (MMRA.hasTag("vulkan", "nonprivate"))
+ return VulkanOpKind::NonPrivate;
+ }
+ }
+ // No vulkan tag matched, or the Vulkan memory model is not enabled.
+ return VulkanOpKind::None;
}
std::optional<SIMemOpInfo>
@@ -904,8 +951,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
return std::nullopt;
}
- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
- IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
+ return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
+ SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
+ AtomicOrdering::NotAtomic, getVulkanOpKind(*MI));
}
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -1165,8 +1213,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
+ Position Pos, VulkanOpKind VKOK) const {
+ if (!InsertCacheInv || VKOK == VulkanOpKind::NonPrivate)
return false;
bool Changed = false;
@@ -1211,7 +1259,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
+ Position Pos, VulkanOpKind VKOK) const {
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos);
}
@@ -1219,8 +1267,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
+ Position Pos, VulkanOpKind VKOK) const {
+ if (!InsertCacheInv || VKOK == VulkanOpKind::NonPrivate)
return false;
bool Changed = false;
@@ -1448,7 +1496,9 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
+ Position Pos,
+ VulkanOpKind VKOK) const {
+ assert(VKOK == VulkanOpKind::None && "Vulkan MM not supported on gfx90a");
if (!InsertCacheInv)
return false;
@@ -1506,7 +1556,7 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
--MI;
- Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
+ Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos, VKOK);
return Changed;
}
@@ -1515,7 +1565,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
+ Position Pos, VulkanOpKind VKOK) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -1556,7 +1606,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
Changed |=
SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
- IsCrossAddrSpaceOrdering, Pos);
+ IsCrossAddrSpaceOrdering, Pos, VKOK);
return Changed;
}
@@ -1717,7 +1767,9 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
+ Position Pos,
+ VulkanOpKind VKOK) const {
+ assert(VKOK == VulkanOpKind::None && "Vulkan MM not supported on gfx90a");
if (!InsertCacheInv)
return false;
@@ -1804,7 +1856,7 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
bool IsCrossAddrSpaceOrdering,
- Position Pos) const {
+ Position Pos, VulkanOpKind VKOK) const {
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -2081,8 +2133,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
+ Position Pos, VulkanOpKind VKOK) const {
+ if (!InsertCacheInv || VKOK == VulkanOpKind::NonPrivate)
return false;
bool Changed = false;
@@ -2376,8 +2428,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
- Position Pos) const {
- if (!InsertCacheInv)
+ Position Pos, VulkanOpKind VKOK) const {
+ if (!InsertCacheInv || VKOK == VulkanOpKind::NonPrivate)
return false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -2518,14 +2570,29 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
SIMemOp::LOAD,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
- Changed |= CC->insertAcquire(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ Changed |=
+ CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
+ Position::AFTER, MOI.getVulkanOpKind());
}
return Changed;
}
+ switch (MOI.getVulkanOpKind()) {
+ case VulkanOpKind::None:
+ // (vulkan): non-annotated, non-atomic loads bypass caches.
+ if (!UsesVulkanMemoryModel || MOI.isAtomic())
+ break;
+ [[fallthrough]];
+ case VulkanOpKind::NonPrivate:
+ Changed |= CC->enableLoadCacheBypass(MI, SIAtomicScope::SYSTEM,
+ MOI.getInstrAddrSpace());
+ break;
+ case VulkanOpKind::Private:
+ assert(!MOI.isAtomic() && "vulkan:private on atomics does not make sense");
+ break;
+ }
+
// Atomic instructions already bypass caches to the scope specified by the
// SyncScope operand. Only non-atomic volatile and nontemporal/last-use
// instructions need additional treatment.
@@ -2555,11 +2622,26 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Position::BEFORE, MOI.getVulkanOpKind());
return Changed;
}
+ switch (MOI.getVulkanOpKind()) {
+ case VulkanOpKind::None:
+ // (vulkan): non-annotated, non-atomic stores bypass caches.
+ if (!UsesVulkanMemoryModel || MOI.isAtomic())
+ break;
+ [[fallthrough]];
+ case VulkanOpKind::NonPrivate:
+ Changed |= CC->enableStoreCacheBypass(MI, SIAtomicScope::SYSTEM,
+ MOI.getInstrAddrSpace());
+ break;
+ case VulkanOpKind::Private:
+ assert(!MOI.isAtomic() && "vulkan:private on atomics does not make sense");
+ break;
+ }
+
// Atomic instructions already bypass caches to the scope specified by the
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
// need additional treatment.
@@ -2604,7 +2686,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
/// adding S_WAITCNT before a S_BARRIER.
Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Position::BEFORE, MOI.getVulkanOpKind());
// TODO: If both release and invalidate are happening they could be combined
// to use the single "BUFFER_WBINV*" instruction. This could be done by
@@ -2615,7 +2697,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
- Position::BEFORE);
+ Position::BEFORE, MOI.getVulkanOpKind());
return Changed;
}
@@ -2629,6 +2711,8 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
bool Changed = false;
+ // TODO: For Vulkan, how do we approach this? enableRMWCacheBypass is just for 90a/940 which don't support vulkan MM.
+ // Needs more investigation for targets implementing the vulkan MM.
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
MOI.getOrdering() == AtomicOrdering::Acquire ||
@@ -2646,7 +2730,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Changed |= CC->insertRelease(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
MOI.getIsCrossAddressSpaceOrdering(),
- Position::BEFORE);
+ Position::BEFORE, MOI.getVulkanOpKind());
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
@@ -2659,9 +2743,9 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER);
- Changed |= CC->insertAcquire(MI, MOI.getScope(),
- MOI.getOrderingAddrSpace(),
- Position::AFTER);
+ Changed |=
+ CC->insertAcquire(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
+ Position::AFTER, MOI.getVulkanOpKind());
}
return Changed;
@@ -2670,12 +2754,25 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
+static bool isModuleUsingVulkanMM(MachineFunction &MF) {
+ Module *M = MF.getFunction().getParent();
+ if (auto *MD = mdconst::extract_or_null<ConstantInt>(
+ M->getModuleFlag("amdgpu.vulkan.memory-model")))
+ return MD->getZExtValue() == 1; // Enabled only when the flag is 1.
+ return false; // No usable module flag: Vulkan memory model is off.
+}
+
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
+ UsesVulkanMemoryModel = isModuleUsingVulkanMM(MF);
- SIMemOpAccess MOA(MF);
+ SIMemOpAccess MOA(MF, UsesVulkanMemoryModel);
CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
+ if (UsesVulkanMemoryModel && !CC->supportsVulkanMM())
+ report_fatal_error("target does not support the Vulkan Memory Model!",
+ false);
+
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-default.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-default.ll
new file mode 100644
index 0000000000000..9c57437f81560
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-default.ll
@@ -0,0 +1,2550 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
+
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s
+
+define amdgpu_kernel void @workgroup_acquire_fence() {
+; GFX6-LABEL: workgroup_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") acquire
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_release_fence() {
+; GFX6-LABEL: workgroup_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") release
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_acq_rel_fence() {
+; GFX6-LABEL: workgroup_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_seq_cst_fence() {
+; GFX6-LABEL: workgroup_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
+; GFX6-LABEL: workgroup_one_as_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_release_fence() {
+; GFX6-LABEL: workgroup_one_as_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
+; GFX6-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
+; GFX6-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @agent_acquire_fence() {
+; GFX6-LABEL: agent_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") acquire
+ ret void
+}
+
+define amdgpu_kernel void @agent_release_fence() {
+; GFX6-LABEL: agent_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") release
+ ret void
+}
+
+define amdgpu_kernel void @agent_acq_rel_fence() {
+; GFX6-LABEL: agent_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @agent_seq_cst_fence() {
+; GFX6-LABEL: agent_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_acquire_fence() {
+; GFX6-LABEL: agent_one_as_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_release_fence() {
+; GFX6-LABEL: agent_one_as_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
+; GFX6-LABEL: agent_one_as_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
+; GFX6-LABEL: agent_one_as_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @system_acquire_fence() {
+; GFX6-LABEL: system_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_invl2
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence acquire
+ ret void
+}
+
+define amdgpu_kernel void @system_release_fence() {
+; GFX6-LABEL: system_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence release
+ ret void
+}
+
+define amdgpu_kernel void @system_acq_rel_fence() {
+; GFX6-LABEL: system_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: buffer_invl2
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @system_seq_cst_fence() {
+; GFX6-LABEL: system_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: buffer_invl2
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_acquire_fence() {
+; GFX6-LABEL: system_one_as_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_invl2
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_release_fence() {
+; GFX6-LABEL: system_one_as_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; Codegen test for a lone acq_rel fence with syncscope "one-as"; the per-target check groups below pin the expected ISA.
+; GFX6-LABEL: system_one_as_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: buffer_invl2
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") acq_rel ; the only operation under test: the gfx90a and gfx940 groups expect a writeback followed by invalidate(s), the skip-cache-inv group expects nothing but s_endpgm
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; Codegen test for a lone seq_cst fence with syncscope "one-as"; the per-target check groups below pin the expected ISA.
+; GFX6-LABEL: system_one_as_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: buffer_invl2
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") seq_cst ; the only operation under test: every check group expects the same instruction sequence as for the acq_rel fence in @system_one_as_acq_rel_fence
+ ret void
+}
+
+
+define amdgpu_kernel void @flat_unordered_load( ; Codegen test: agent-scope unordered atomic i32 load from %in, forwarded to %out by a plain store; the check groups below pin the expected flat_* ISA per target.
+; GFX7-LABEL: flat_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_unordered_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") unordered, align 4 ; operation under test: no check group expects a cache-policy modifier on the resulting flat load
+ store i32 %val, ptr %out ; plain non-atomic store that consumes the loaded value; only the gfx940 check groups expect modifiers on it (sc0 sc1)
+ ret void
+}
+
+define amdgpu_kernel void @flat_monotonic_load( ; Codegen test: agent-scope monotonic atomic i32 load from %in, forwarded to %out by a plain store; the check groups below pin the expected flat_* ISA per target.
+; GFX7-LABEL: flat_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_monotonic_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4 ; operation under test: the flat load is expected to carry glc (glc dlc in the gfx10 groups, sc1 in the gfx940 groups)
+ store i32 %val, ptr %out ; plain non-atomic store that consumes the loaded value; only the gfx940 check groups expect modifiers on it (sc0 sc1)
+ ret void
+}
+
+define amdgpu_kernel void @flat_acquire_load( ; Codegen test: agent-scope acquire atomic i32 load from %in, forwarded to %out by a plain store; the check groups below pin the expected flat_* ISA per target.
+; GFX7-LABEL: flat_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_acquire_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") acquire, align 4 ; operation under test: the load is expected to be followed by an s_waitcnt and then a cache invalidate; the skip-cache-inv group expects the wait but no invalidate
+ store i32 %val, ptr %out ; plain non-atomic store that consumes the loaded value; only the gfx940 check groups expect modifiers on it (sc0 sc1)
+ ret void
+}
+
+define amdgpu_kernel void @flat_seq_cst_load( ; Codegen test: agent-scope seq_cst atomic i32 load from %in, forwarded to %out by a plain store; the check groups below pin the expected flat_* ISA per target.
+; GFX7-LABEL: flat_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: buffer_gl1_inv
+; GFX10-WGP-NEXT: buffer_gl0_inv
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: buffer_gl0_inv
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_seq_cst_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 ; operation under test: every check group expects the same load / s_waitcnt / invalidate shape as for the acquire load in @flat_acquire_load
+ store i32 %val, ptr %out ; plain non-atomic store that consumes the loaded value; only the gfx940 check groups expect modifiers on it (sc0 sc1)
+ ret void
+}
+
+define amdgpu_kernel void @flat_unordered_store( ; Codegen test: agent-scope unordered atomic i32 store of the kernel argument %in to %out; the check groups below pin the expected flat_* ISA per target.
+; GFX7-LABEL: flat_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_unordered_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 ; operation under test: the flat store is expected without modifiers everywhere except the gfx940 check groups, which expect sc0 sc1
+ ret void
+}
+
+define amdgpu_kernel void @flat_monotonic_store( ; Agent-scope monotonic atomic store via a flat pointer; the store carries no !mmra tag.
+; GFX7-LABEL: flat_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_monotonic_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 ; Only the GFX940 expectations deviate from a plain flat store (sc0 sc1 on the store).
+ ret void
+}
+
+define amdgpu_kernel void @flat_release_store( ; Agent-scope release atomic store via a flat pointer; the store carries no !mmra tag.
+; GFX7-LABEL: flat_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_release_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") release, align 4 ; Only the GFX940 expectations deviate from a plain flat store (buffer_wbl2 sc1 first, then sc0 sc1 on the store).
+ ret void
+}
+
+define amdgpu_kernel void @flat_seq_cst_store( ; Agent-scope seq_cst atomic store via a flat pointer; the store carries no !mmra tag.
+; GFX7-LABEL: flat_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_seq_cst_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 ; Only the GFX940 expectations deviate from a plain flat store (buffer_wbl2 sc1 first, then sc0 sc1 on the store).
+ ret void
+}
+
+; TODO: Test more address spaces (only flat is covered currently).
+; TODO: Add more coverage for non-atomic loads/stores.
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu.vulkan.memory-model", i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-nonprivate.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-nonprivate.ll
new file mode 100644
index 0000000000000..b3215247e0030
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-nonprivate.ll
@@ -0,0 +1,4599 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
+
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s
+
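+; TODO: Test other atomics? Also prune the GFX6, GFX90A-* and GFX940-* check lines below, which appear unused (no RUN line in this file selects those prefixes).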
+
+define amdgpu_kernel void @workgroup_acquire_fence() { ; Workgroup-scope acquire fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") acquire, !mmra !0 ; Every prefix expects a bare s_endpgm except the GFX6 lines, which expect s_waitcnt lgkmcnt(0) first.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_release_fence() { ; Workgroup-scope release fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") release, !mmra !0 ; Every prefix expects a bare s_endpgm except the GFX6 lines, which expect s_waitcnt lgkmcnt(0) first.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_acq_rel_fence() { ; Workgroup-scope acq_rel fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") acq_rel, !mmra !0 ; Every prefix expects a bare s_endpgm except the GFX6 lines, which expect s_waitcnt lgkmcnt(0) first.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_seq_cst_fence() { ; Workgroup-scope seq_cst fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") seq_cst, !mmra !0 ; Every prefix expects a bare s_endpgm except the GFX6 lines, which expect s_waitcnt lgkmcnt(0) first.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; workgroup-one-as acquire fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_one_as_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") acquire, !mmra !0 ; Every prefix expects a bare s_endpgm, i.e. no code for the fence.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_release_fence() { ; workgroup-one-as release fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_one_as_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") release, !mmra !0 ; Every prefix expects a bare s_endpgm, i.e. no code for the fence.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; workgroup-one-as acq_rel fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") acq_rel, !mmra !0 ; Every prefix expects a bare s_endpgm, i.e. no code for the fence.
+ ret void
+}
+
+define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; workgroup-one-as seq_cst fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup-one-as") seq_cst, !mmra !0 ; Every prefix expects a bare s_endpgm, i.e. no code for the fence.
+ ret void
+}
+
+define amdgpu_kernel void @agent_acquire_fence() { ; Agent-scope acquire fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: agent_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") acquire, !mmra !0 ; Every prefix expects a bare s_endpgm except the GFX6 lines (s_waitcnt vmcnt(0) lgkmcnt(0), then buffer_wbinvl1).
+ ret void
+}
+
+define amdgpu_kernel void @agent_release_fence() { ; Agent-scope release fence tagged with !mmra !0 (!0 is defined outside this chunk).
+; GFX6-LABEL: agent_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") release, !mmra !0 ; Bare s_endpgm expected everywhere except the GFX6 lines (s_waitcnt vmcnt(0) lgkmcnt(0)) and the GFX940 lines (buffer_wbl2 sc1).
+ ret void
+}
+
+define amdgpu_kernel void @agent_acq_rel_fence() {
+; GFX6-LABEL: agent_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") acq_rel, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_seq_cst_fence() {
+; GFX6-LABEL: agent_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") seq_cst, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_acquire_fence() {
+; GFX6-LABEL: agent_one_as_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") acquire, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_release_fence() {
+; GFX6-LABEL: agent_one_as_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") release, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
+; GFX6-LABEL: agent_one_as_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") acq_rel, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
+; GFX6-LABEL: agent_one_as_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: agent_one_as_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: agent_one_as_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("agent-one-as") seq_cst, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_acquire_fence() {
+; GFX6-LABEL: system_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence acquire, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_release_fence() {
+; GFX6-LABEL: system_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence release, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_acq_rel_fence() {
+; GFX6-LABEL: system_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence acq_rel, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_seq_cst_fence() {
+; GFX6-LABEL: system_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence seq_cst, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_acquire_fence() {
+; GFX6-LABEL: system_one_as_acquire_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_acquire_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_acquire_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_acquire_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_acquire_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_acquire_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") acquire, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_release_fence() {
+; GFX6-LABEL: system_one_as_release_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_release_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_release_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_release_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_release_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_release_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") release, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_acq_rel_fence() {
+; GFX6-LABEL: system_one_as_acq_rel_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_acq_rel_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_acq_rel_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_acq_rel_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") acq_rel, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @system_one_as_seq_cst_fence() {
+; GFX6-LABEL: system_one_as_seq_cst_fence:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: system_one_as_seq_cst_fence:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: system_one_as_seq_cst_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: system_one_as_seq_cst_fence:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: buffer_wbl2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+entry:
+ fence syncscope("one-as") seq_cst, !mmra !0
+ ret void
+}
+
+
+define amdgpu_kernel void @flat_unordered_load(
+; GFX7-LABEL: flat_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_unordered_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") unordered, align 4, !mmra !0
+ store i32 %val, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @flat_monotonic_load(
+; GFX7-LABEL: flat_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_monotonic_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4, !mmra !0
+ store i32 %val, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @flat_acquire_load( ; Test kernel: agent-scope acquire atomic load from %in; the loaded value is then written to %out with a plain store.
+; GFX7-LABEL: flat_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_acquire_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") acquire, align 4, !mmra !0 ; Operation under test. !0 is defined outside this excerpt - presumably the vulkan nonprivate MMRA tag, TODO confirm. In the expected asm above the load carries glc (glc dlc on GFX10, sc1 on GFX940) and is followed directly by an s_waitcnt; no cache-invalidate instruction is expected.
+ store i32 %val, ptr %out ; Plain store with no atomic ordering and no !mmra; the expected asm adds sc0 sc1 to it only in the GFX940 variants.
+ ret void
+}
+
+define amdgpu_kernel void @flat_seq_cst_load( ; Test kernel: agent-scope seq_cst atomic load from %in; the loaded value is then written to %out with a plain store.
+; GFX7-LABEL: flat_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_seq_cst_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4, !mmra !0 ; Operation under test. !0 is defined outside this excerpt - presumably the vulkan nonprivate MMRA tag, TODO confirm. In the expected asm above the only wait before the load is the s_waitcnt lgkmcnt(0) after the kernel-argument s_load; the load carries glc (glc dlc on GFX10, sc1 on GFX940), is followed directly by an s_waitcnt, and no cache-invalidate instruction is expected.
+ store i32 %val, ptr %out ; Plain store with no atomic ordering and no !mmra; the expected asm adds sc0 sc1 to it only in the GFX940 variants.
+ ret void
+}
+
+define amdgpu_kernel void @flat_unordered_store( ; Test kernel: agent-scope unordered atomic store of the i32 argument %in through the pointer %out.
+; GFX7-LABEL: flat_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_unordered_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4, !mmra !0 ; Operation under test. !0 is defined outside this excerpt - presumably the vulkan nonprivate MMRA tag, TODO confirm. The expected asm above is a flat store with no cache-control modifiers, except sc0 sc1 in the GFX940 variants; no writeback instruction is expected.
+ ret void
+}
+
+define amdgpu_kernel void @flat_monotonic_store( ; Test kernel: agent-scope monotonic atomic store of the i32 argument %in through the pointer %out.
+; GFX7-LABEL: flat_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_monotonic_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4, !mmra !0 ; Operation under test. !0 is defined outside this excerpt - presumably the vulkan nonprivate MMRA tag, TODO confirm. The expected asm above is a flat store with no cache-control modifiers, except sc0 sc1 in the GFX940 variants; no writeback instruction is expected.
+ ret void
+}
+
+define amdgpu_kernel void @flat_release_store( ; Test kernel: agent-scope release atomic store of the i32 argument %in through the pointer %out.
+; GFX7-LABEL: flat_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_release_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") release, align 4, !mmra !0 ; Operation under test. !0 is defined outside this excerpt - presumably the vulkan nonprivate MMRA tag, TODO confirm. In the expected asm above no s_waitcnt sits between the register setup and the store. NOTE(review): only the GFX940 variants expect a writeback (buffer_wbl2 sc1) before the store, and it is not followed by a wait - confirm this is intended given the patch notes say nonprivate needs no writeback.
+ ret void
+}
+
+define amdgpu_kernel void @flat_seq_cst_store(
+; GFX7-LABEL: flat_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_seq_cst_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: v_mov_b32_e32 v2, s2
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr %out) {
+entry:
+ store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @flat_non_atomic_load_store(
+; GFX7-LABEL: flat_non_atomic_load_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_non_atomic_load_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_non_atomic_load_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_non_atomic_load_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_non_atomic_load_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_non_atomic_load_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_non_atomic_load_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load i32, ptr %in, align 4, !mmra !0
+ store i32 %val, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @global_unordered_load(
+; GFX7-LABEL: global_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_unordered_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4, !mmra !0
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @global_monotonic_load(
+; GFX7-LABEL: global_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_monotonic_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4, !mmra !0
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @global_acquire_load(
+; GFX7-LABEL: global_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_acquire_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4, !mmra !0
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @global_seq_cst_load(
+; GFX7-LABEL: global_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_seq_cst_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4, !mmra !0
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @global_unordered_store(
+; GFX7-LABEL: global_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_unordered_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(1) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @global_monotonic_store(
+; GFX7-LABEL: global_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_monotonic_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(1) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @global_release_store(
+; GFX7-LABEL: global_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_release_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(1) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @global_seq_cst_store(
+; GFX7-LABEL: global_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_clause 0x1
+; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_clause 0x1
+; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_seq_cst_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s2
+; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(1) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @global_non_atomic_load_store(
+; GFX7-LABEL: global_non_atomic_load_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_non_atomic_load_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_non_atomic_load_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_non_atomic_load_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_non_atomic_load_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_non_atomic_load_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_non_atomic_load_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load i32, ptr addrspace(1) %in, align 4, !mmra !0
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_unordered_load(
+; GFX7-LABEL: local_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_read_b32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: ds_read_b32 v0, v0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: ds_write_b32 v1, v0
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: ds_read_b32 v0, v0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: ds_write_b32 v1, v0
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_unordered_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: ds_write_b32 v1, v0
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") unordered, align 4, !mmra !0
+ store i32 %val, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_monotonic_load(
+; GFX7-LABEL: local_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_read_b32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: ds_read_b32 v0, v0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: ds_write_b32 v1, v0
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: ds_read_b32 v0, v0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: ds_write_b32 v1, v0
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_monotonic_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: ds_write_b32 v1, v0
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") monotonic, align 4, !mmra !0
+ store i32 %val, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_acquire_load(
+; GFX7-LABEL: local_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_read_b32 v0, v0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: ds_read_b32 v0, v0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: ds_write_b32 v1, v0
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: ds_read_b32 v0, v0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: ds_write_b32 v1, v0
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_acquire_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: ds_write_b32 v1, v0
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") acquire, align 4, !mmra !0
+ store i32 %val, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_seq_cst_load(
+; GFX7-LABEL: local_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_read_b32 v0, v0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: ds_read_b32 v0, v0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: ds_write_b32 v1, v0
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: ds_read_b32 v0, v0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: ds_write_b32 v1, v0
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_seq_cst_load:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: ds_write_b32 v1, v0
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4, !mmra !0
+ store i32 %val, ptr addrspace(3) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_unordered_store(
+; GFX7-LABEL: local_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT: ds_write_b32 v0, v1
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT: ds_write_b32 v0, v1
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_unordered_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: ds_write_b32 v0, v1
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(3) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") unordered, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @local_monotonic_store(
+; GFX7-LABEL: local_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT: ds_write_b32 v0, v1
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT: ds_write_b32 v0, v1
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_monotonic_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: ds_write_b32 v0, v1
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(3) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @local_release_store(
+; GFX7-LABEL: local_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT: ds_write_b32 v0, v1
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT: ds_write_b32 v0, v1
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_release_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: ds_write_b32 v0, v1
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(3) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @local_seq_cst_store(
+; GFX7-LABEL: local_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v0, v1
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT: ds_write_b32 v0, v1
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT: ds_write_b32 v0, v1
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_seq_cst_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: ds_write_b32 v0, v1
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ i32 %in, ptr addrspace(3) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4, !mmra !0
+ ret void
+}
+
+define amdgpu_kernel void @local_non_atomic_load_store(
+; GFX7-LABEL: local_non_atomic_load_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_read_b32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_non_atomic_load_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: ds_read_b32 v0, v0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: ds_write_b32 v1, v0
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_non_atomic_load_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: ds_read_b32 v0, v0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: ds_write_b32 v1, v0
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_non_atomic_load_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: ds_write_b32 v1, v0
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_non_atomic_load_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_non_atomic_load_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_non_atomic_load_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+ %val = load i32, ptr addrspace(3) %in, align 4, !mmra !0
+ store i32 %val, ptr addrspace(3) %out
+ ret void
+}
+
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"amdgpu.vulkan.memory-model", i32 1}
+
+!0 = !{!"vulkan", !"nonprivate"}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-private.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-private.ll
new file mode 100644
index 0000000000000..b21012a2609cf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mmra-vulkan-private.ll
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
+
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s
+
+; TODO: Atomics are not tested because vulkan:private is not allowed on atomic operations, so this file only covers non-atomic loads and stores for now.
+
+define amdgpu_kernel void @flat_non_atomic_load_store(
+; GFX7-LABEL: flat_non_atomic_load_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: flat_non_atomic_load_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: flat_non_atomic_load_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: flat_non_atomic_load_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: flat_load_dword v2, v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v0, s2
+; GFX900-NEXT: v_mov_b32_e32 v1, s3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v2
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: flat_non_atomic_load_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_non_atomic_load_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_non_atomic_load_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: flat_non_atomic_load_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr %in, ptr %out) {
+entry:
+ %val = load i32, ptr %in, align 4, !mmra !0
+ store i32 %val, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @global_non_atomic_load_store(
+; GFX7-LABEL: global_non_atomic_load_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: flat_store_dword v[0:1], v2
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: global_non_atomic_load_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: global_non_atomic_load_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: global_non_atomic_load_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: global_non_atomic_load_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_non_atomic_load_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_nop 0
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_non_atomic_load_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_nop 0
+; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: global_non_atomic_load_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load i32, ptr addrspace(1) %in, align 4, !mmra !0
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_non_atomic_load_store(
+; GFX7-LABEL: local_non_atomic_load_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: ds_read_b32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: local_non_atomic_load_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-WGP-NEXT: ds_read_b32 v0, v0
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: ds_write_b32 v1, v0
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: local_non_atomic_load_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-CU-NEXT: ds_read_b32 v0, v0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: ds_write_b32 v1, v0
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX900-LABEL: local_non_atomic_load_store:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s0
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, s1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: ds_write_b32 v1, v0
+; GFX900-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: local_non_atomic_load_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_non_atomic_load_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_non_atomic_load_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
+; GFX90A-NOTTGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+; GFX90A-TGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+; GFX940-NOTTGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+; GFX940-TGSPLIT-LABEL: local_non_atomic_load_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
+ ptr addrspace(3) %in, ptr addrspace(3) %out) {
+entry:
+ %val = load i32, ptr addrspace(3) %in, align 4, !mmra !0
+ store i32 %val, ptr addrspace(3) %out
+ ret void
+}
+
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"amdgpu.vulkan.memory-model", i32 1}
+
+!0 = !{!"vulkan", !"private"}
More information about the llvm-commits
mailing list