[llvm] r334241 - [AMDGPU] Simplify memory legalizer

Tony Tye via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 7 15:28:32 PDT 2018


Author: t-tye
Date: Thu Jun  7 15:28:32 2018
New Revision: 334241

URL: http://llvm.org/viewvc/llvm-project?rev=334241&view=rev
Log:
[AMDGPU] Simplify memory legalizer

- Make code easier to maintain.
- Avoid generating waitcnts for VMEM if the address space does not involve VMEM.
- Add support to generate waitcnts for LDS and GDS memory.

Differential Revision: https://reviews.llvm.org/D47504


Added:
    llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
    llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir
    llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
    llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
    llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
    llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir

Modified: llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp?rev=334241&r1=334240&r2=334241&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMemoryLegalizer.cpp Thu Jun  7 15:28:32 2018
@@ -21,6 +21,7 @@
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -37,6 +38,7 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <list>
 
@@ -48,42 +50,142 @@ using namespace llvm::AMDGPU;
 
 namespace {
 
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
+/// Memory operation flags. Can be ORed together.
+enum class SIMemOp {
+  NONE = 0u,
+  LOAD = 1u << 0,
+  STORE = 1u << 1,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
+};
+
+/// Position to insert a new instruction relative to an existing
+/// instruction.
+enum class Position {
+  BEFORE,
+  AFTER
+};
+
+/// The atomic synchronization scopes supported by the AMDGPU target.
+enum class SIAtomicScope {
+  NONE,
+  SINGLETHREAD,
+  WAVEFRONT,
+  WORKGROUP,
+  AGENT,
+  SYSTEM
+};
+
+/// The distinct address spaces supported by the AMDGPU target for
+/// atomic memory operations. Can be ORed together.
+enum class SIAtomicAddrSpace {
+  NONE = 0u,
+  GLOBAL = 1u << 0,
+  LDS = 1u << 1,
+  SCRATCH = 1u << 2,
+  GDS = 1u << 3,
+  OTHER = 1u << 4,
+
+  /// The address spaces that can be accessed by a FLAT instruction.
+  FLAT = GLOBAL | LDS | SCRATCH,
+
+  /// The address spaces that support atomic instructions.
+  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
+
+  /// All address spaces.
+  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
+
+/// Sets named bit \p BitName to "true" if present in instruction \p MI.
+/// \returns Returns true if \p MI is modified, false otherwise.
+template <uint16_t BitName>
+bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
+  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+  if (BitIdx == -1)
+    return false;
+
+  MachineOperand &Bit = MI->getOperand(BitIdx);
+  if (Bit.getImm() != 0)
+    return false;
+
+  Bit.setImm(1);
+  return true;
+}
+
 class SIMemOpInfo final {
 private:
-  SyncScope::ID SSID = SyncScope::System;
+
+  friend class SIMemOpAccess;
+
   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
+  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
+  bool IsCrossAddressSpaceOrdering = false;
   bool IsNonTemporal = false;
 
-  SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
-      : SSID(SSID), Ordering(Ordering) {}
-
-  SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
-              AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
-      : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
-        IsNonTemporal(IsNonTemporal) {}
-
-  /// \returns Info constructed from \p MI, which has at least machine memory
-  /// operand.
-  static Optional<SIMemOpInfo> constructFromMIWithMMO(
-      const MachineBasicBlock::iterator &MI);
+  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
+              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
+              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
+              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
+              bool IsCrossAddressSpaceOrdering = true,
+              AtomicOrdering FailureOrdering =
+                AtomicOrdering::SequentiallyConsistent,
+              bool IsNonTemporal = false)
+    : Ordering(Ordering), FailureOrdering(FailureOrdering),
+      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
+      InstrAddrSpace(InstrAddrSpace),
+      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
+      IsNonTemporal(IsNonTemporal) {
+    // There is also no cross address space ordering if the ordering
+    // address space is the same as the instruction address space and
+    // only contains a single address space.
+    if ((OrderingAddrSpace == InstrAddrSpace) &&
+        isPowerOf2_32(uint32_t(InstrAddrSpace)))
+      IsCrossAddressSpaceOrdering = false;
+  }
 
 public:
-  /// \returns Synchronization scope ID of the machine instruction used to
+  /// \returns Atomic synchronization scope of the machine instruction used to
   /// create this SIMemOpInfo.
-  SyncScope::ID getSSID() const {
-    return SSID;
+  SIAtomicScope getScope() const {
+    return Scope;
   }
+
   /// \returns Ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo.
   AtomicOrdering getOrdering() const {
     return Ordering;
   }
+
   /// \returns Failure ordering constraint of the machine instruction used to
   /// create this SIMemOpInfo.
   AtomicOrdering getFailureOrdering() const {
     return FailureOrdering;
   }
+
+  /// \returns The address spaces accessed by the machine
+  /// instruction used to create this SIMemOpInfo.
+  SIAtomicAddrSpace getInstrAddrSpace() const {
+    return InstrAddrSpace;
+  }
+
+  /// \returns The address spaces that must be ordered by the machine
+  /// instruction used to create this SIMemOpInfo.
+  SIAtomicAddrSpace getOrderingAddrSpace() const {
+    return OrderingAddrSpace;
+  }
+
+  /// \returns Return true iff memory ordering of operations on
+  /// different address spaces is required.
+  bool getIsCrossAddressSpaceOrdering() const {
+    return IsCrossAddressSpaceOrdering;
+  }
+
   /// \returns True if memory access of the machine instruction used to
   /// create this SIMemOpInfo is non-temporal, false otherwise.
   bool isNonTemporal() const {
@@ -96,59 +198,111 @@ public:
     return Ordering != AtomicOrdering::NotAtomic;
   }
 
+};
+
+class SIMemOpAccess final {
+private:
+
+  AMDGPUAS SIAddrSpaceInfo;
+  AMDGPUMachineModuleInfo *MMI = nullptr;
+
+  /// Reports unsupported message \p Msg for \p MI to LLVM context.
+  void reportUnsupported(const MachineBasicBlock::iterator &MI,
+                         const char *Msg) const;
+
+  /// Inspects the target synchronization scope \p SSID and determines
+  /// the SI atomic scope it corresponds to, the address spaces it
+  /// covers, and whether the memory ordering applies between address
+  /// spaces.
+  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+
+  /// \return Return a bit set of the address spaces accessed by \p AS.
+  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
+
+  /// \returns Info constructed from \p MI, which has at least machine memory
+  /// operand.
+  Optional<SIMemOpInfo> constructFromMIWithMMO(
+      const MachineBasicBlock::iterator &MI) const;
+
+public:
+  /// Construct class to support accessing the machine memory operands
+  /// of instructions in the machine function \p MF.
+  SIMemOpAccess(MachineFunction &MF);
+
   /// \returns Load info if \p MI is a load operation, "None" otherwise.
-  static Optional<SIMemOpInfo> getLoadInfo(
-      const MachineBasicBlock::iterator &MI);
+  Optional<SIMemOpInfo> getLoadInfo(
+      const MachineBasicBlock::iterator &MI) const;
+
   /// \returns Store info if \p MI is a store operation, "None" otherwise.
-  static Optional<SIMemOpInfo> getStoreInfo(
-      const MachineBasicBlock::iterator &MI);
+  Optional<SIMemOpInfo> getStoreInfo(
+      const MachineBasicBlock::iterator &MI) const;
+
   /// \returns Atomic fence info if \p MI is an atomic fence operation,
   /// "None" otherwise.
-  static Optional<SIMemOpInfo> getAtomicFenceInfo(
-      const MachineBasicBlock::iterator &MI);
+  Optional<SIMemOpInfo> getAtomicFenceInfo(
+      const MachineBasicBlock::iterator &MI) const;
+
   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
   /// rmw operation, "None" otherwise.
-  static Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
-      const MachineBasicBlock::iterator &MI);
-
-  /// Reports unknown synchronization scope used in \p MI to LLVM
-  /// context.
-  static void reportUnknownSyncScope(
-      const MachineBasicBlock::iterator &MI);
+  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
+      const MachineBasicBlock::iterator &MI) const;
 };
 
-class SIMemoryLegalizer final : public MachineFunctionPass {
-private:
-  /// Machine module info.
-  const AMDGPUMachineModuleInfo *MMI = nullptr;
+class SICacheControl {
+protected:
 
   /// Instruction info.
   const SIInstrInfo *TII = nullptr;
 
-  /// Immediate for "vmcnt(0)".
-  unsigned Vmcnt0Immediate = 0;
+  IsaInfo::IsaVersion IV;
 
-  /// Opcode for cache invalidation instruction (L1).
-  unsigned VmemSIMDCacheInvalidateOpc = 0;
+  SICacheControl(const SISubtarget &ST);
 
-  /// List of atomic pseudo instructions.
-  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+public:
 
-  /// Sets named bit (BitName) to "true" if present in \p MI. Returns
-  /// true if \p MI is modified, false otherwise.
-  template <uint16_t BitName>
-  bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
-    int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
-    if (BitIdx == -1)
-      return false;
-
-    MachineOperand &Bit = MI->getOperand(BitIdx);
-    if (Bit.getImm() != 0)
-      return false;
+  /// Create a cache control for the subtarget \p ST.
+  static std::unique_ptr<SICacheControl> create(const SISubtarget &ST);
 
-    Bit.setImm(1);
-    return true;
-  }
+  /// Update \p MI memory load instruction to bypass any caches up to
+  /// the \p Scope memory scope for address spaces \p
+  /// AddrSpace. Return true iff the instruction was modified.
+  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace) const = 0;
+
+  /// Update \p MI memory instruction to indicate it is
+  /// nontemporal. Return true iff the instruction was modified.
+  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
+    const = 0;
+
+  /// Inserts any necessary instructions at position \p Pos relative
+  /// to instruction \p MI to ensure any caches associated with
+  /// address spaces \p AddrSpace for memory scopes up to memory scope
+  /// \p Scope are invalidated. Returns true iff any instructions
+  /// inserted.
+  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace,
+                                     Position Pos) const = 0;
+
+  /// Inserts any necessary instructions at position \p Pos relative
+  /// to instruction \p MI to ensure memory instructions of kind \p Op
+  /// associated with address spaces \p AddrSpace have completed as
+  /// observed by other memory instructions executing in memory scope
+  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
+  /// ordering is between address spaces. Returns true iff any
+  /// instructions inserted.
+  virtual bool insertWait(MachineBasicBlock::iterator &MI,
+                          SIAtomicScope Scope,
+                          SIAtomicAddrSpace AddrSpace,
+                          SIMemOp Op,
+                          bool IsCrossAddrSpaceOrdering,
+                          Position Pos) const = 0;
+};
+
+class SIGfx6CacheControl : public SICacheControl {
+protected:
 
   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
   /// is modified, false otherwise.
@@ -162,14 +316,55 @@ private:
     return enableNamedBit<AMDGPU::OpName::slc>(MI);
   }
 
-  /// Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
-  /// Always returns true.
-  bool insertVmemSIMDCacheInvalidate(MachineBasicBlock::iterator &MI,
-                                     bool Before = true) const;
-  /// Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
-  /// Always returns true.
-  bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
-                           bool Before = true) const;
+public:
+
+  SIGfx6CacheControl(const SISubtarget &ST) : SICacheControl(ST) {};
+
+  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const override;
+
+  bool insertWait(MachineBasicBlock::iterator &MI,
+                  SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace,
+                  SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering,
+                  Position Pos) const override;
+};
+
+class SIGfx7CacheControl : public SIGfx6CacheControl {
+public:
+
+  SIGfx7CacheControl(const SISubtarget &ST) : SIGfx6CacheControl(ST) {};
+
+  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const override;
+
+};
+
+class SIMemoryLegalizer final : public MachineFunctionPass {
+private:
+
+  /// Cache Control.
+  std::unique_ptr<SICacheControl> CC = nullptr;
+
+  /// List of atomic pseudo instructions.
+  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
+
+  /// Return true iff instruction \p MI is an atomic instruction that
+  /// returns a result.
+  bool isAtomicRet(const MachineInstr &MI) const {
+    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
+  }
 
   /// Removes all processed atomic pseudo instructions from the current
   /// function. Returns true if current function is modified, false otherwise.
@@ -211,48 +406,129 @@ public:
 
 } // end namespace anonymous
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
-    const MachineBasicBlock::iterator &MI) {
-  assert(MI->getNumMemOperands() > 0);
+void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
+                                      const char *Msg) const {
+  const Function &Func = MI->getParent()->getParent()->getFunction();
+  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
+  Func.getContext().diagnose(Diag);
+}
+
+Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
+SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
+                               SIAtomicAddrSpace InstrScope) const {
+  /// TODO: For now assume OpenCL memory model which treats each
+  /// address space as having a separate happens-before relation, and
+  /// so an instruction only has ordering with respect to the address
+  /// space it accesses, and if it accesses multiple address spaces it
+  /// does not require ordering of operations in different address
+  /// spaces.
+ if (SSID == SyncScope::System)
+    return std::make_tuple(SIAtomicScope::SYSTEM,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == MMI->getAgentSSID())
+    return std::make_tuple(SIAtomicScope::AGENT,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == MMI->getWorkgroupSSID())
+    return std::make_tuple(SIAtomicScope::WORKGROUP,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == MMI->getWavefrontSSID())
+    return std::make_tuple(SIAtomicScope::WAVEFRONT,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  if (SSID == SyncScope::SingleThread)
+    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
+                           SIAtomicAddrSpace::ATOMIC & InstrScope,
+                           false);
+  /// TODO: To support HSA Memory Model need to add additional memory
+  /// scopes that do require cross address space
+  /// ordering.
+  return None;
+}
+
+SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
+  if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+    return SIAtomicAddrSpace::FLAT;
+  if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+    return SIAtomicAddrSpace::GLOBAL;
+  if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+    return SIAtomicAddrSpace::LDS;
+  if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+    return SIAtomicAddrSpace::SCRATCH;
+  if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+    return SIAtomicAddrSpace::GDS;
+
+  return SIAtomicAddrSpace::OTHER;
+}
+
+SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
+  SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
+  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+}
 
-  const MachineFunction *MF = MI->getParent()->getParent();
-  const AMDGPUMachineModuleInfo *MMI =
-      &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
+Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
+    const MachineBasicBlock::iterator &MI) const {
+  assert(MI->getNumMemOperands() > 0);
 
   SyncScope::ID SSID = SyncScope::SingleThread;
   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
+  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
   bool IsNonTemporal = true;
 
   // Validator should check whether or not MMOs cover the entire set of
   // locations accessed by the memory instruction.
   for (const auto &MMO : MI->memoperands()) {
-    const auto &IsSyncScopeInclusion =
-        MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
-    if (!IsSyncScopeInclusion) {
-      reportUnknownSyncScope(MI);
-      return None;
-    }
-
-    SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
-    Ordering =
-        isStrongerThan(Ordering, MMO->getOrdering()) ?
-            Ordering : MMO->getOrdering();
-    FailureOrdering =
-        isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
-            FailureOrdering : MMO->getFailureOrdering();
+    IsNonTemporal &= MMO->isNonTemporal();
+    InstrAddrSpace |=
+      toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
+    AtomicOrdering OpOrdering = MMO->getOrdering();
+    if (OpOrdering != AtomicOrdering::NotAtomic) {
+      const auto &IsSyncScopeInclusion =
+          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
+      if (!IsSyncScopeInclusion) {
+        reportUnsupported(MI,
+          "Unsupported non-inclusive atomic synchronization scope");
+        return None;
+      }
 
-    if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
-      IsNonTemporal = false;
+      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
+      Ordering =
+          isStrongerThan(Ordering, OpOrdering) ?
+              Ordering : MMO->getOrdering();
+      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
+             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
+      FailureOrdering =
+          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
+              FailureOrdering : MMO->getFailureOrdering();
+    }
   }
 
-  return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
+  SIAtomicScope Scope = SIAtomicScope::NONE;
+  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+  bool IsCrossAddressSpaceOrdering = false;
+  if (Ordering != AtomicOrdering::NotAtomic) {
+    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
+    if (!ScopeOrNone) {
+      reportUnsupported(MI, "Unsupported atomic synchronization scope");
+      return None;
+    }
+    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+      ScopeOrNone.getValue();
+    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+      reportUnsupported(MI, "Unsupported atomic address space");
+      return None;
+    }
+  }
+  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+                     IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo(
-    const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
+    const MachineBasicBlock::iterator &MI) const {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
   if (!(MI->mayLoad() && !MI->mayStore()))
@@ -260,15 +536,13 @@ Optional<SIMemOpInfo> SIMemOpInfo::getLo
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo(SyncScope::System,
-                       AtomicOrdering::SequentiallyConsistent);
+    return SIMemOpInfo();
 
-  return SIMemOpInfo::constructFromMIWithMMO(MI);
+  return constructFromMIWithMMO(MI);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo(
-    const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
+    const MachineBasicBlock::iterator &MI) const {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
   if (!(!MI->mayLoad() && MI->mayStore()))
@@ -276,30 +550,46 @@ Optional<SIMemOpInfo> SIMemOpInfo::getSt
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo(SyncScope::System,
-                       AtomicOrdering::SequentiallyConsistent);
+    return SIMemOpInfo();
 
-  return SIMemOpInfo::constructFromMIWithMMO(MI);
+  return constructFromMIWithMMO(MI);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo(
-    const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
+    const MachineBasicBlock::iterator &MI) const {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
     return None;
 
-  SyncScope::ID SSID =
-      static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
   AtomicOrdering Ordering =
-      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
-  return SIMemOpInfo(SSID, Ordering);
+    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
+
+  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
+  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
+  if (!ScopeOrNone) {
+    reportUnsupported(MI, "Unsupported atomic synchronization scope");
+    return None;
+  }
+
+  SIAtomicScope Scope = SIAtomicScope::NONE;
+  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
+  bool IsCrossAddressSpaceOrdering = false;
+  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
+    ScopeOrNone.getValue();
+
+  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
+      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+    reportUnsupported(MI, "Unsupported atomic address space");
+    return None;
+  }
+
+  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
+                     IsCrossAddressSpaceOrdering);
 }
 
-/* static */
-Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgOrRmwInfo(
-    const MachineBasicBlock::iterator &MI) {
+Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
+    const MachineBasicBlock::iterator &MI) const {
   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 
   if (!(MI->mayLoad() && MI->mayStore()))
@@ -307,52 +597,251 @@ Optional<SIMemOpInfo> SIMemOpInfo::getAt
 
   // Be conservative if there are no memory operands.
   if (MI->getNumMemOperands() == 0)
-    return SIMemOpInfo(SyncScope::System,
-                       AtomicOrdering::SequentiallyConsistent,
-                       AtomicOrdering::SequentiallyConsistent);
+    return SIMemOpInfo();
 
-  return SIMemOpInfo::constructFromMIWithMMO(MI);
+  return constructFromMIWithMMO(MI);
+}
+
+SICacheControl::SICacheControl(const SISubtarget &ST) {
+  TII = ST.getInstrInfo();
+  IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
 }
 
 /* static */
-void SIMemOpInfo::reportUnknownSyncScope(
-    const MachineBasicBlock::iterator &MI) {
-  DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(),
-                                 "Unsupported synchronization scope");
-  LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext();
-  CTX->diagnose(Diag);
+std::unique_ptr<SICacheControl> SICacheControl::create(const SISubtarget &ST) {
+  AMDGPUSubtarget::Generation Generation = ST.getGeneration();
+  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return make_unique<SIGfx6CacheControl>(ST);
+  return make_unique<SIGfx7CacheControl>(ST);
+}
+
+bool SIGfx6CacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    /// TODO: Do not set glc for rmw atomic operations as they
+    /// implicitly bypass the L1 cache.
+
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
+bool SIGfx6CacheControl::enableNonTemporal(
+    const MachineBasicBlock::iterator &MI) const {
+  assert(MI->mayLoad() ^ MI->mayStore());
+  bool Changed = false;
+
+  /// TODO: Do not enableGLCBit if rmw atomic.
+  Changed |= enableGLCBit(MI);
+  Changed |= enableSLCBit(MI);
+
+  return Changed;
 }
 
-bool SIMemoryLegalizer::insertVmemSIMDCacheInvalidate(
-  MachineBasicBlock::iterator &MI, bool Before) const {
+bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                               SIAtomicScope Scope,
+                                               SIAtomicAddrSpace AddrSpace,
+                                               Position Pos) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
-  if (!Before)
+  if (Pos == Position::AFTER)
     ++MI;
 
-  BuildMI(MBB, MI, DL, TII->get(VmemSIMDCacheInvalidateOpc));
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
 
-  if (!Before)
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
 }
 
-bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
-                                            bool Before) const {
+bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                    SIAtomicScope Scope,
+                                    SIAtomicAddrSpace AddrSpace,
+                                    SIMemOp Op,
+                                    bool IsCrossAddrSpaceOrdering,
+                                    Position Pos) const {
+  bool Changed = false;
+
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MI->getDebugLoc();
 
-  if (!Before)
+  if (Pos == Position::AFTER)
     ++MI;
 
-  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate);
+  bool VMCnt = false;
+  bool LGKMCnt = false;
+  bool EXPCnt = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      VMCnt = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The L1 cache keeps all memory operations in order for
+      // wavefronts in the same work-group.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
 
-  if (!Before)
+  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+      // If no cross address space ordering then an LDS waitcnt is not
+      // needed as LDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. Required if
+      // also synchronizing with global/GDS memory as LDS operations
+      // could be reordered with respect to later global/GDS memory
+      // operations of the same wave.
+      LGKMCnt = IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The LDS keeps all memory operations in order for
+      // the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      // If no cross address space ordering then a GDS waitcnt is not
+      // needed as GDS operations for all waves are executed in a
+      // total global ordering as observed by all waves. Required if
+      // also synchronizing with global/LDS memory as GDS operations
+      // could be reordered with respect to later global/LDS memory
+      // operations of the same wave.
+      EXPCnt = IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The GDS keeps all memory operations in order for
+      // the same work-group.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (VMCnt || LGKMCnt || EXPCnt) {
+    unsigned WaitCntImmediate =
+      AMDGPU::encodeWaitcnt(IV,
+                            VMCnt ? 0 : getVmcntBitMask(IV),
+                            EXPCnt ? 0 : getExpcntBitMask(IV),
+                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+    Changed = true;
+  }
+
+  if (Pos == Position::AFTER)
     --MI;
 
-  return true;
+  return Changed;
+}
+
+bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                               SIAtomicScope Scope,
+                                               SIAtomicAddrSpace AddrSpace,
+                                               Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
 }
 
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
@@ -373,38 +862,38 @@ bool SIMemoryLegalizer::expandLoad(const
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
-          MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= enableGLCBit(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
-        Changed |= insertWaitcntVmcnt0(MI, false);
-        Changed |= insertVmemSIMDCacheInvalidate(MI, false);
-      }
-
-      return Changed;
+    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
+        MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace());
     }
 
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      return Changed;
+    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
+
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getInstrAddrSpace(),
+                                SIMemOp::LOAD,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::AFTER);
+      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace(),
+                                           Position::AFTER);
     }
 
-    llvm_unreachable("Unsupported synchronization scope");
+    return Changed;
   }
 
   // Atomic instructions do not have the nontemporal attribute.
   if (MOI.isNonTemporal()) {
-    Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+    Changed |= CC->enableNonTemporal(MI);
     return Changed;
   }
 
@@ -418,28 +907,20 @@ bool SIMemoryLegalizer::expandStore(cons
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Release ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
+    if (MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
 
-      return Changed;
-    }
-
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      return Changed;
-    }
-
-    llvm_unreachable("Unsupported synchronization scope");
+    return Changed;
   }
 
   // Atomic instructions do not have the nontemporal attribute.
   if (MOI.isNonTemporal()) {
-    Changed |= enableGLCBit(MI);
-    Changed |= enableSLCBit(MI);
+    Changed |= CC->enableNonTemporal(MI);
     return Changed;
   }
 
@@ -450,34 +931,35 @@ bool SIMemoryLegalizer::expandAtomicFenc
                                           MachineBasicBlock::iterator &MI) {
   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
 
+  AtomicPseudoMIs.push_back(MI);
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::Release ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertVmemSIMDCacheInvalidate(MI);
-
-      AtomicPseudoMIs.push_back(MI);
-      return Changed;
-    }
-
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      AtomicPseudoMIs.push_back(MI);
-      return Changed;
-    }
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      /// TODO: This relies on a barrier always generating a waitcnt
+      /// for LDS to ensure it is not reordered with the completion of
+      /// the preceding LDS operations. If the barrier had a memory
+      /// ordering and memory scope, then the library would not need to
+      /// generate a fence. Could add support in this file for
+      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
+      /// adding waitcnt before a S_BARRIER.
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
+
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace(),
+                                           Position::BEFORE);
 
-    SIMemOpInfo::reportUnknownSyncScope(MI);
+    return Changed;
   }
 
   return Changed;
@@ -490,34 +972,33 @@ bool SIMemoryLegalizer::expandAtomicCmpx
   bool Changed = false;
 
   if (MOI.isAtomic()) {
-    if (MOI.getSSID() == SyncScope::System ||
-        MOI.getSSID() == MMI->getAgentSSID()) {
-      if (MOI.getOrdering() == AtomicOrdering::Release ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
-          MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
-        Changed |= insertWaitcntVmcnt0(MI);
-
-      if (MOI.getOrdering() == AtomicOrdering::Acquire ||
-          MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
-          MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
-          MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
-          MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
-        Changed |= insertWaitcntVmcnt0(MI, false);
-        Changed |= insertVmemSIMDCacheInvalidate(MI, false);
-      }
-
-      return Changed;
+    if (MOI.getOrdering() == AtomicOrdering::Release ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                SIMemOp::LOAD | SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::BEFORE);
+
+    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
+        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
+        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
+        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
+        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
+      Changed |= CC->insertWait(MI, MOI.getScope(),
+                                MOI.getOrderingAddrSpace(),
+                                isAtomicRet(*MI) ? SIMemOp::LOAD :
+                                                   SIMemOp::STORE,
+                                MOI.getIsCrossAddressSpaceOrdering(),
+                                Position::AFTER);
+      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
+                                           MOI.getOrderingAddrSpace(),
+                                           Position::AFTER);
     }
 
-    if (MOI.getSSID() == SyncScope::SingleThread ||
-        MOI.getSSID() == MMI->getWorkgroupSSID() ||
-        MOI.getSSID() == MMI->getWavefrontSSID()) {
-      Changed |= enableGLCBit(MI);
-      return Changed;
-    }
-
-    llvm_unreachable("Unsupported synchronization scope");
+    return Changed;
   }
 
   return Changed;
@@ -525,30 +1006,22 @@ bool SIMemoryLegalizer::expandAtomicCmpx
 
 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
-
-  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
-  TII = ST.getInstrInfo();
 
-  Vmcnt0Immediate =
-      AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
-  VmemSIMDCacheInvalidateOpc =
-     ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ?
-       AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL;
+  SIMemOpAccess MOA(MF);
+  CC = SICacheControl::create(MF.getSubtarget<SISubtarget>());
 
   for (auto &MBB : MF) {
     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;
 
-      if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI))
+      if (const auto &MOI = MOA.getLoadInfo(MI))
         Changed |= expandLoad(MOI.getValue(), MI);
-      else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI))
+      else if (const auto &MOI = MOA.getStoreInfo(MI))
         Changed |= expandStore(MOI.getValue(), MI);
-      else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI))
+      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
         Changed |= expandAtomicFence(MOI.getValue(), MI);
-      else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgOrRmwInfo(MI))
+      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
     }
   }

Added: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir?rev=334241&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir Thu Jun  7 15:28:32 2018
@@ -0,0 +1,69 @@
+# RUN: not llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - 2>&1 | FileCheck -check-prefix=GCN %s
+
+---
+
+# GCN: error: <unknown>:0:0: in function invalid_load void (): Unsupported atomic address space
+
+name:            invalid_load
+body:             |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
+    renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load seq_cst 4 from `i32 addrspace(42)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN: error: <unknown>:0:0: in function invalid_store void (): Unsupported atomic address space
+
+name:            invalid_store
+body:             |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(42)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN: error: <unknown>:0:0: in function invalid_cmpxchg void (): Unsupported atomic address space
+
+name:            invalid_cmpxchg
+body:             |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    $vgpr3 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit-def $vgpr2_vgpr3, implicit $sgpr0_sgpr1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN: error: <unknown>:0:0: in function invalid_rmw void (): Unsupported atomic address space
+
+name:            invalid_rmw
+body:             |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+
+    $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec
+    $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront") seq_cst 4 on `i32 addrspace(42)* undef`)
+    S_ENDPGM
+
+...

Modified: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll?rev=334241&r1=334240&r2=334241&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll Thu Jun  7 15:28:32 2018
@@ -1,14 +1,14 @@
 ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
 ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
 
-; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported atomic synchronization scope
 define amdgpu_kernel void @invalid_fence() {
 entry:
   fence syncscope("invalid") seq_cst
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_load void (i32*, i32*): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_load void (i32*, i32*): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_load(
     i32* %in, i32* %out) {
 entry:
@@ -17,7 +17,7 @@ entry:
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_store void (i32, i32*): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_store void (i32, i32*): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_store(
     i32 %in, i32* %out) {
 entry:
@@ -25,7 +25,7 @@ entry:
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_cmpxchg void (i32*, i32, i32): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_cmpxchg void (i32*, i32, i32): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_cmpxchg(
     i32* %out, i32 %in, i32 %old) {
 entry:
@@ -34,7 +34,7 @@ entry:
   ret void
 }
 
-; CHECK: error: <unknown>:0:0: in function invalid_rmw void (i32*, i32): Unsupported synchronization scope
+; CHECK: error: <unknown>:0:0: in function invalid_rmw void (i32*, i32): Unsupported non-inclusive atomic synchronization scope
 define amdgpu_kernel void @invalid_rmw(
     i32* %out, i32 %in) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll?rev=334241&r1=334240&r2=334241&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-load.ll Thu Jun  7 15:28:32 2018
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @system_unordered(
     i32* %in, i32* %out) {
@@ -23,7 +23,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @system_monotonic(
     i32* %in, i32* %out) {
@@ -65,7 +65,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_unordered(
     i32* %in, i32* %out) {
@@ -79,7 +79,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_monotonic(
     i32* %in, i32* %out) {
@@ -93,7 +93,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_acquire(
     i32* %in, i32* %out) {
@@ -107,7 +107,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @singlethread_seq_cst(
     i32* %in, i32* %out) {
@@ -121,7 +121,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @agent_unordered(
     i32* %in, i32* %out) {
@@ -135,7 +135,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @agent_monotonic(
     i32* %in, i32* %out) {
@@ -175,9 +175,9 @@ entry:
 
 ; GCN-LABEL: {{^}}workgroup_unordered:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
+; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @workgroup_unordered(
     i32* %in, i32* %out) {
@@ -191,7 +191,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @workgroup_monotonic(
     i32* %in, i32* %out) {
@@ -233,7 +233,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_unordered(
     i32* %in, i32* %out) {
@@ -247,7 +247,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_monotonic(
     i32* %in, i32* %out) {
@@ -261,7 +261,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_acquire(
     i32* %in, i32* %out) {
@@ -275,7 +275,7 @@ entry:
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
 ; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
 ; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
-; GCN-NOT:   buffer_wbinvl1_vol
+; GFX89-NOT: buffer_wbinvl1_vol
 ; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
 define amdgpu_kernel void @wavefront_seq_cst(
     i32* %in, i32* %out) {

Added: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir?rev=334241&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-local.mir Thu Jun  7 15:28:32 2018
@@ -0,0 +1,1054 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# GCN-LABEL: name: load_singlethread_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_singlethread_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_singlethread_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_singlethread_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(3)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_acq_rel
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_acq_rel
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`)
+    S_ENDPGM
+
+...

Modified: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir?rev=334241&r1=334240&r2=334241&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir Thu Jun  7 15:28:32 2018
@@ -1,119 +1,19 @@
-# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s
 
---- |
-  ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
-  source_filename = "memory-legalizer-multiple-mem-operands.ll"
-  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
-
-  define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
-  entry:
-    %scratch0 = alloca [8192 x i32], addrspace(5)
-    %scratch1 = alloca [8192 x i32], addrspace(5)
-    %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)*
-    store i32 1, i32 addrspace(5)* %scratchptr01
-    %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)*
-    store i32 2, i32 addrspace(5)* %scratchptr12
-    %cmp = icmp eq i32 %cond, 0
-    br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
-
-  if:                                               ; preds = %entry
-    %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
-    %if_value = load atomic i32, i32 addrspace(5)* %if_ptr syncscope("workgroup") seq_cst, align 4
-    br label %done, !structurizecfg.uniform !0
-
-  else:                                             ; preds = %entry
-    %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
-    %else_value = load atomic i32, i32 addrspace(5)* %else_ptr syncscope("agent") unordered, align 4
-    br label %done, !structurizecfg.uniform !0
-
-  done:                                             ; preds = %else, %if
-    %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
-    store i32 %value, i32 addrspace(1)* %out
-    ret void
-  }
-
-  ; Function Attrs: convergent nounwind
-  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
-
-  ; Function Attrs: convergent nounwind
-  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
-
-  ; Function Attrs: convergent nounwind readnone
-  declare i64 @llvm.amdgcn.break(i64) #2
-
-  ; Function Attrs: convergent nounwind readnone
-  declare i64 @llvm.amdgcn.if.break(i1, i64) #2
-
-  ; Function Attrs: convergent nounwind readnone
-  declare i64 @llvm.amdgcn.else.break(i64, i64) #2
-
-  ; Function Attrs: convergent nounwind
-  declare i1 @llvm.amdgcn.loop(i64) #1
-
-  ; Function Attrs: convergent nounwind
-  declare void @llvm.amdgcn.end.cf(i64) #1
-
-  attributes #0 = { "target-cpu"="gfx803" }
-  attributes #1 = { convergent nounwind }
-  attributes #2 = { convergent nounwind readnone }
-
-  !0 = !{}
-
-...
 ---
 
-# CHECK-LABEL: name: multiple_mem_operands
+# GCN-LABEL: name: multiple_mem_operands
 
-# CHECK-LABEL: bb.3.done:
-# CHECK:       S_WAITCNT 3952
-# CHECK-NEXT:  BUFFER_LOAD_DWORD_OFFEN
-# CHECK-NEXT:  S_WAITCNT 3952
-# CHECK-NEXT:  BUFFER_WBINVL1_VOL
+# GCN-LABEL: bb.3:
+# GCN:       S_WAITCNT 3952
+# GCN-NEXT:  BUFFER_LOAD_DWORD_OFFEN
+# GCN-NEXT:  S_WAITCNT 3952
+# GCN-NEXT:  BUFFER_WBINVL1_VOL
 
 name:            multiple_mem_operands
-alignment:       0
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:
-liveins:
-  - { reg: '$sgpr0_sgpr1', virtual-reg: '' }
-  - { reg: '$sgpr3', virtual-reg: '' }
-frameInfo:
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       65540
-  offsetAdjustment: 0
-  maxAlignment:    4
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 0
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:
-  - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
-      isImmutable: false, isAliased: false, callee-saved-register: '' }
-stack:
-  - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
-      stack-id: 0, callee-saved-register: '', local-offset: 0,
-      debug-info-variable: '', debug-info-expression: '',
-      debug-info-location: '' }
-  - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
-      alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
-      debug-info-variable: '', debug-info-expression: '',
-      debug-info-location: '' }
-constants:
 body:             |
   bb.0.entry:
-    successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
     liveins: $sgpr0_sgpr1, $sgpr3
 
     $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
@@ -123,43 +23,43 @@ body:             |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
-    S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
+    S_CBRANCH_SCC0 %bb.1, implicit killed $scc
 
-  bb.2.else:
-    successors: %bb.3.done(0x80000000)
+  bb.2:
+    successors: %bb.3(0x80000000)
     liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
 
     $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 32772, implicit $exec
-    S_BRANCH %bb.3.done
+    S_BRANCH %bb.3
 
-  bb.1.if:
-    successors: %bb.3.done(0x80000000)
+  bb.1:
+    successors: %bb.3(0x80000000)
     liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
 
     $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 4, implicit $exec
 
-  bb.3.done:
+  bb.3:
     liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0
 
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from %ir.else_ptr), (load syncscope("workgroup") seq_cst 4 from %ir.if_ptr)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
-    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out)
+    FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`)
     S_ENDPGM
 
 ...

Added: llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir?rev=334241&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/memory-legalizer-region.mir Thu Jun  7 15:28:32 2018
@@ -0,0 +1,1054 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# GCN-LABEL: name: load_singlethread_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_singlethread_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_singlethread_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_singlethread_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_singlethread_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_wavefront_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_wavefront_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_workgroup_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_workgroup_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_agent_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_agent_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: load_system_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_READ_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       FLAT_STORE_DWORD
+
+name:            load_system_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(4)* undef`)
+    $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec
+    FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_singlethread_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_singlethread_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_wavefront_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_wavefront_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_workgroup_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_workgroup_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_agent_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_agent_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: store_system_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRITE_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            store_system_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_unordered
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_unordered
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_monotonic
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_monotonic
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_acquire
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_acquire
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_release
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_release
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_acq_rel
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_acq_rel
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# GCN-LABEL: name: atomicrmw_singlethread_seq_cst
+
+# GCN-LABEL: bb.0:
+# GCN-NOT:   S_WAITCNT
+# GCN:       DS_WRXCHG_RTN_B32
+# GCN-NOT:   S_WAITCNT
+# GCN:       S_ENDPGM
+
+name:            atomicrmw_singlethread_seq_cst
+body:             |
+  bb.0:
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+    $m0 = S_MOV_B32 -1
+    $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+    $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(4)* undef`)
+    S_ENDPGM
+
+...




More information about the llvm-commits mailing list