[llvm] d371417 - [AMDGPU] Enable volatile and non-temporal for loads to LDS (#153244)
    via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Mon Oct 20 10:42:26 PDT 2025
    
    
  
Author: Krzysztof Drewniak
Date: 2025-10-20T12:42:22-05:00
New Revision: d37141776f6b6d0116f4566d4ee56b29ecdc7d0e
URL: https://github.com/llvm/llvm-project/commit/d37141776f6b6d0116f4566d4ee56b29ecdc7d0e
DIFF: https://github.com/llvm/llvm-project/commit/d37141776f6b6d0116f4566d4ee56b29ecdc7d0e.diff
LOG: [AMDGPU] Enable volatile and non-temporal for loads to LDS (#153244)
The primary purpose of this commit is to enable marking loads to LDS
(global.load.lds, buffer.*.load.lds) volatile (using bit 31 of the aux
argument, as with normal buffer loads) and to ensure that their !nontemporal
annotations translate to appropriate settings of the cache control bits.
However, in the process of implementing this feature, we also fixed:
- Incorrect handling of buffer loads to LDS in GlobalISel
- The handling of volatile on buffers in SIMemoryLegalizer: previously, the
mapping of address spaces would cause volatile on buffer loads to be silently
dropped on at least gfx10.
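As a minimal sketch of both new controls (adapted from the
llvm.amdgcn.load.to.lds.ll tests updated in this commit; 2147483648 is
simply bit 31 set):

  define amdgpu_ps void @load_to_lds_volatile_and_nontemporal(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr) {
    ; Volatile load to LDS: bit 31 (0x80000000) of the aux operand is set.
    call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
    ; Nontemporal load to LDS: requested via !nontemporal metadata, aux left at 0.
    call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
    ret void
  }
  !0 = !{i32 1}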
---------
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
Added: 
    llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/SIDefines.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
    llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
    llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
Removed: 
    
################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e062032313058..f971bc0756544 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -6512,6 +6512,13 @@ operations.
 ``buffer/global/flat_load/store/atomic`` instructions to global memory are
 termed vector memory operations.
 
+``global_load_lds`` or ``buffer/global_load`` instructions with the ``lds``
+flag are LDS DMA loads. They interact with caches as if the loaded data were
+being loaded to registers rather than to LDS, and therefore support the same
+cache modifiers. They cannot be performed atomically. They implement volatile
+(via aux/cpol bit 31) and nontemporal (via metadata) as if they were loads
+from the global address space.
+
 Private address space uses ``buffer_load/store`` using the scratch V#
 (GFX6-GFX8), or ``scratch_load/store`` (GFX9-GFX11). Since only a single thread
 is accessing the memory, atomic memory orderings are not meaningful, and all
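A condensed sketch of the LDS DMA behaviour documented in the paragraph added
above, using the buffer form (operand layout taken from the
llvm.amdgcn.raw.ptr.buffer.load.lds.ll test updated below):

  define amdgpu_ps void @buffer_load_lds_example(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
    ; Volatile: bit 31 of the trailing aux operand.
    call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648)
    ; Nontemporal: !nontemporal metadata, aux left at 0.
    call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0
    ret void
  }
  !0 = !{i32 1}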
diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ded00b1274670..9e334d4316336 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,20 +2819,24 @@ class AMDGPULoadToLDS :
      "", [SDNPMemOperand]>;
 def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
 
-class AMDGPUGlobalLoadLDS :
-  ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
-  Intrinsic <
-    [],
-    [LLVMQualPointerType<1>,            // Base global pointer to load from
-     LLVMQualPointerType<3>,            // LDS base pointer to store to
-     llvm_i32_ty,                       // Data byte size: 1/2/4 (/12/16 for gfx950)
-     llvm_i32_ty,                       // imm offset (applied to both global and LDS address)
-     llvm_i32_ty],                      // auxiliary data (imm, cachepolicy (bit 0 = sc0,
-                                        //                                   bit 1 = sc1,
-                                        //                                   bit 4 = scc))
-    [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
-     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
-     "", [SDNPMemOperand]>;
+class AMDGPUGlobalLoadLDS
+    : ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
+      Intrinsic<
+          [],
+          [LLVMQualPointerType<1>, // Base global pointer to load from
+           LLVMQualPointerType<3>, // LDS base pointer to store to
+           llvm_i32_ty,            // Data byte size: 1/2/4 (/12/16 for gfx950)
+           llvm_i32_ty,  // imm offset (applied to both global and LDS address)
+           llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+                         //                                   bit 1 = sc1,
+                         //                                   bit 4 = scc,
+                         //                                   bit 31 = volatile
+                         //                                   (compiler
+                         //                                   implemented)))
+          [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+           ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
+           IntrNoCallback, IntrNoFree],
+          "", [SDNPMemOperand]>;
 def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
 
 // This is IntrHasSideEffects because it reads from a volatile hardware register.
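The aux layout described in the comment above (sc0/sc1/scc are hardware
cache-policy bits; bit 31 is the compiler-implemented volatile bit, which is
stripped before the hardware cpol is encoded) can be exercised directly. A
minimal sketch, adapted from the new memory-legalizer test:

  define amdgpu_ps void @global_load_lds_aux_example(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr) {
    ; aux = 0x80000000: volatile; handled by the compiler, never emitted as a cpol bit.
    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
    ; aux = 0: no cache-policy bits requested.
    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0)
    ret void
  }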
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c7344426..97c2c9c5316b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
           : 0); // swz
 
   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+  // Don't set the offset value here because the pointer points to the base of
+  // the buffer.
   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
-  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+
   MachinePointerInfo StorePtrI = LoadPtrI;
-  StorePtrI.V = nullptr;
+  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+                                                 AMDGPUAS::BUFFER_RESOURCE));
+  LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
   auto F = LoadMMO->getFlags() &
@@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   if (isSGPR(Addr))
     MIB.addReg(VOffset);
 
-  MIB.add(MI.getOperand(4))  // offset
-     .add(MI.getOperand(5)); // cpol
+  MIB.add(MI.getOperand(4)); // offset
+
+  unsigned Aux = MI.getOperand(5).getImm();
+  MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
 
   MachineMemOperand *LoadMMO = *MI.memoperands_begin();
   MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
   LoadPtrI.Offset = MI.getOperand(4).getImm();
   MachinePointerInfo StorePtrI = LoadPtrI;
+  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+                                                 AMDGPUAS::GLOBAL_ADDRESS));
   LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
   StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
   auto F = LoadMMO->getFlags() &
diff  --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index ecc28244cc71e..b7a92a0a1d634 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -423,6 +423,9 @@ enum CPol {
   // Volatile (used to preserve/signal operation volatility for buffer
   // operations not a real instruction bit)
   VOLATILE = 1 << 31,
+  // The set of "cache policy" bits used for compiler features that
+  // do not correspond to hardware features.
+  VIRTUAL_BITS = VOLATILE,
 };
 
 } // namespace CPol
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2841c114a698..a7574213c2907 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1651,6 +1651,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
     Info.ptrVal = CI.getArgOperand(1);
     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+    if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+      Info.flags |= MachineMemOperand::MOVolatile;
     return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11219,8 +11222,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
     MachinePointerInfo StorePtrI = LoadPtrI;
     LoadPtrI.V = PoisonValue::get(
-        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
-    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+        PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
+    LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
     auto F = LoadMMO->getFlags() &
@@ -11307,7 +11310,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     }
 
     Ops.push_back(Op.getOperand(5));  // Offset
-    Ops.push_back(Op.getOperand(6));  // CPol
+
+    unsigned Aux = Op.getConstantOperandVal(6);
+    Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
+                                        MVT::i32)); // CPol
+
     Ops.push_back(M0Val.getValue(0)); // Chain
     Ops.push_back(M0Val.getValue(1)); // Glue
 
diff  --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 362ef140a28f8..bdbc000524ce4 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/TargetParser/TargetParser.h"
 
@@ -277,6 +278,12 @@ class SIMemOpAccess final {
   /// rmw operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
   getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
+
+  /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
+  /// along with an indication of whether this is a load or store. If it is not
+  /// a direct-to-LDS operation, returns std::nullopt.
+  std::optional<SIMemOpInfo>
+  getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
 };
 
 class SICacheControl {
@@ -703,6 +710,9 @@ class SIMemoryLegalizer final {
   /// instructions are added/deleted or \p MI is modified, false otherwise.
   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                 MachineBasicBlock::iterator &MI);
+  /// Expands LDS DMA operation \p MI. Returns true if instructions are
+  /// added/deleted or \p MI is modified, false otherwise.
+  bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
 
 public:
   SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -832,6 +842,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
     return SIAtomicAddrSpace::SCRATCH;
   if (AS == AMDGPUAS::REGION_ADDRESS)
     return SIAtomicAddrSpace::GDS;
+  if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+      AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
+    return SIAtomicAddrSpace::GLOBAL;
 
   return SIAtomicAddrSpace::OTHER;
 }
@@ -987,6 +1000,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
   return constructFromMIWithMMO(MI);
 }
 
+std::optional<SIMemOpInfo>
+SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
+  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+  if (!SIInstrInfo::isLDSDMA(*MI))
+    return std::nullopt;
+
+  return constructFromMIWithMMO(MI);
+}
+
 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
   TII = ST.getInstrInfo();
   IV = getIsaVersion(ST.getCPU());
@@ -1099,7 +1122,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -1429,7 +1452,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -1733,7 +1756,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -1968,7 +1991,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -2266,7 +2289,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
   // Only handle load and store, not atomic read-modify-write insructions. The
   // latter use glc to indicate if the atomic returns a result and so must not
   // be used for cache control.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -2611,7 +2634,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
     bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
 
   // Only handle load and store, not atomic read-modify-write instructions.
-  assert(MI->mayLoad() ^ MI->mayStore());
+  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
 
   // Only update load and store, not LLVM IR atomic read-modify-write
   // instructions. The latter are always marked as volatile so cannot sensibly
@@ -2934,6 +2957,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
   return Changed;
 }
 
+bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
+                                     MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoad() && MI->mayStore());
+
+  // The volatility or nontemporal-ness of the operation is a
+  // function of the global memory, not the LDS.
+  SIMemOp OpKind =
+      SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
+
+  // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
+  // stores. The operation is treated as a volatile/nontemporal store
+  // to its second argument.
+  return CC->enableVolatileAndOrNonTemporal(
+      MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
+      MOI.isNonTemporal(), MOI.isLastUse());
+}
+
 bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
   const MachineModuleInfo &MMI =
       getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -2985,14 +3025,17 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
         continue;
 
-      if (const auto &MOI = MOA.getLoadInfo(MI))
+      if (const auto &MOI = MOA.getLoadInfo(MI)) {
         Changed |= expandLoad(*MOI, MI);
-      else if (const auto &MOI = MOA.getStoreInfo(MI)) {
+      } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
         Changed |= expandStore(*MOI, MI);
-      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
+      } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
+        Changed |= expandLDSDMA(*MOI, MI);
+      } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
         Changed |= expandAtomicFence(*MOI, MI);
-      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+      } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
         Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
+      }
     }
   }
 
diff  --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 53b2542cf9a7e..142290a39f8f4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc
-; SDAG-NEXT:    s_waitcnt vmcnt(1)
-; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
 ; SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v6
 ; SDAG-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
-; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; SDAG-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    buffer_store_short v4, off, s[16:19], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GISEL-NEXT:    buffer_store_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    buffer_store_short v2, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
index 5d03dfb56c8cc..352af044b0a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll
@@ -218,3 +218,174 @@ main_body:
   ret void
 }
 
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    s_mov_b32 m0, s0
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:256 lds
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    s_mov_b32 m0, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:256
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_mov_b32 m0, s0
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:256 lds
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:256
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_volatile:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_volatile:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:256 lds
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0)
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v0, s[0:1] glc slc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v0, s[0:1] nt
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v0, s[0:1] slc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    global_load_lds_dword v0, s[0:1] nt
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  ret void
+}
+
+define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0: ; %main_body
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT:    s_endpgm
+main_body:
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  ret void
+}
+
+!0 = !{i32 1}
diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
index f0204bd81470a..1dcd032e14c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll
@@ -110,3 +110,43 @@ main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0)
   ret void
 }
+
+define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_volatile:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 glc lds
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:256 lds
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 offset:512 lds
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0)
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) {
+; GCN-LABEL: buffer_load_lds_dword_nontemporal:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_load_dword off, s[0:3], 0 glc slc lds
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    ds_read_b32 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0
+  %res = load float, ptr addrspace(3) %lds
+  ret float %res
+}
+
+!0 = !{i32 1}
diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index b5d741b99c582..a9799993f5cdc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p
 ; PREGFX10-LABEL: buffer_load_volatile:
 ; PREGFX10:       ; %bb.0: ; %main_body
 ; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
 ; PREGFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
 ; PREGFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
 ; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
 ; PREGFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: buffer_load_volatile:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: buffer_load_volatile:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
diff  --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 97db15ba637a5..1d1d3e4a68fee 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s13, s[4:5], 0x30
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
@@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
 ; GFX12-NEXT:    s_mov_b32 s3, s12
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen
 ; GFX12-NEXT:    s_endpgm
 entry:
diff  --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index 9dac2393fd966..1e4b63327651e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[10:11]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-SDAG-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
 ; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, s10
@@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-SDAG-NEXT:    s_mov_b32 s3, s10
 ; GFX9-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[10:11]
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
 ; GFX9-GISEL-NEXT:    s_load_dword s7, s[8:9], 0x30
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, s11
@@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX9-GISEL-NEXT:    s_mov_b32 s10, s3
 ; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-SDAG-NEXT:    s_or_b64 s[8:9], s[2:3], s[12:13]
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-SDAG-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_load_dword s13, s[4:5], 0x30
 ; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX942-SDAG-NEXT:    s_mov_b32 s5, s12
@@ -425,8 +428,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-SDAG-NEXT:    s_mov_b32 s3, s12
 ; GFX942-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-GISEL-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX942-GISEL-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX942-GISEL-NEXT:    s_load_dword s9, s[4:5], 0x30
 ; GFX942-GISEL-NEXT:    s_mov_b32 s4, s7
@@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX942-GISEL-NEXT:    s_mov_b32 s6, s3
 ; GFX942-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s0
-; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-SDAG-NEXT:    s_mov_b32 s11, s2
 ; GFX10-SDAG-NEXT:    s_or_b64 s[4:5], s[12:13], s[10:11]
 ; GFX10-SDAG-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_clause 0x1
 ; GFX10-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
 ; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-SDAG-NEXT:    s_mov_b32 s2, s1
 ; GFX10-SDAG-NEXT:    s_mov_b32 s3, s10
 ; GFX10-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[10:11]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
 ; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
 ; GFX10-GISEL-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
 ; GFX10-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_clause 0x1
 ; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
@@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX10-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
 ; GFX10-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX11-SDAG-NEXT:    buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    s_clause 0x1
 ; GFX11-SDAG-NEXT:    s_load_b32 s13, s[4:5], 0x30
 ; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
@@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, s12
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
 ; GFX11-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_clause 0x1
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
 ; GFX11-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
@@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX11-GISEL-NEXT:    s_mov_b32 s8, s3
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
+; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX12-SDAG-NEXT:    buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b32 s13, s[4:5], 0x30
 ; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
@@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-SDAG-NEXT:    s_mov_b32 s3, s12
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    s_or_b64 s[4:5], s[2:3], s[12:13]
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
 ; GFX12-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
 ; GFX12-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
@@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
 ; GFX12-GISEL-NEXT:    s_mov_b32 s8, s3
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0
diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
new file mode 100644
index 0000000000000..0b40c81af414d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s
+
+define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off slc lds
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc slc lds
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off nt
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off slc lds
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT:    s_mov_b32 m0, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off sc0 sc1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_lds_dword v[0:1], off offset:512
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v[0:1], off glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s5
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_mov_b32 m0, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s4, v0
+; GFX10-NEXT:    s_mov_b32 m0, s5
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b32 m0, s4
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 m0, s4
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648)
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b32 m0, s4
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc slc lds
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen nt lds
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 m0, s4
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen slc lds
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) {
+; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b32 m0, s4
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc lds
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 m0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX942-GISEL-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mov_b32 m0, s4
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen glc dlc lds
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, s[0:3], 0 offen offset:512 lds
+; GFX10-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0
+  call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0)
+  ret void
+}
+
+!0 = !{i32 1}
        
    
    