[llvm] 0455596 - [AMDGPU] Add DAG ISel support for preloaded kernel arguments

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 25 09:33:28 PDT 2023


Author: Austin Kerbow
Date: 2023-09-25T09:32:59-07:00
New Revision: 0455596e1e7fecb6c76de9eba4e2ffc8772eadc2

URL: https://github.com/llvm/llvm-project/commit/0455596e1e7fecb6c76de9eba4e2ffc8772eadc2
DIFF: https://github.com/llvm/llvm-project/commit/0455596e1e7fecb6c76de9eba4e2ffc8772eadc2.diff

LOG: [AMDGPU] Add DAG ISel support for preloaded kernel arguments

This patch adds the DAG isel changes for kernel argument preloading.
These changes are not usable with older firmware but subsequent patches
in the series will make the codegen backwards compatible. This patch
should only be submitted alongside that subsequent patch.

Preloading starts at the first kernel argument and continues
sequentially until the number of arguments indicated by the CL flag
amdgpu-kernarg-preload-count has been preloaded.

Aggregates and arguments passed by-ref are not supported.

Special care is needed for the alignment of the kernarg segment, and
for the alignment of addressable SGPR tuples: when an argument is loaded
into a misaligned large tuple, that tuple cannot be used directly and
the value has to be assembled from the individual SGPRs instead.

Reviewed By: bcahoon

Differential Revision: https://reviews.llvm.org/D158579

Added: 
    llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
    llvm/test/CodeGen/AMDGPU/preload-kernargs.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
    llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 1c18cbd855fcef4..de25f9241a50367 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -60,6 +60,7 @@ bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
   return false;
 }
 
+// TODO: Print preload kernargs?
 void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
   for (const auto &FI : ArgInfoMap) {
     OS << "Arguments for " << FI.first->getName() << '\n'
@@ -148,7 +149,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
   llvm_unreachable("unexpected preloaded value type");
 }
 
-constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
+AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
   AMDGPUFunctionArgInfo AI;
   AI.PrivateSegmentBuffer
     = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f595e469f998412..ee21fe1615be4ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/Register.h"
 #include "llvm/Pass.h"
 
@@ -37,22 +38,19 @@ struct ArgDescriptor {
   bool IsSet : 1;
 
 public:
-  constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
-                bool IsStack = false, bool IsSet = false)
-    : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+  ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false,
+                bool IsSet = false)
+      : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
 
-  static constexpr ArgDescriptor createRegister(Register Reg,
-                                                unsigned Mask = ~0u) {
+  static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
     return ArgDescriptor(Reg, Mask, false, true);
   }
 
-  static constexpr ArgDescriptor createStack(unsigned Offset,
-                                             unsigned Mask = ~0u) {
+  static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
     return ArgDescriptor(Offset, Mask, true, true);
   }
 
-  static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
-                                           unsigned Mask) {
+  static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
     return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
   }
 
@@ -94,6 +92,11 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
   return OS;
 }
 
+struct KernArgPreloadDescriptor : public ArgDescriptor {
+  KernArgPreloadDescriptor() {}
+  SmallVector<MCRegister> Regs;
+};
+
 struct AMDGPUFunctionArgInfo {
   enum PreloadedValue {
     // SGPRS:
@@ -151,10 +154,13 @@ struct AMDGPUFunctionArgInfo {
   ArgDescriptor WorkItemIDY;
   ArgDescriptor WorkItemIDZ;
 
+  // Map the index of each preloaded kernel argument to its descriptor.
+  SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
+
   std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
   getPreloadedValue(PreloadedValue Value) const;
 
-  static constexpr AMDGPUFunctionArgInfo fixedABILayout();
+  static AMDGPUFunctionArgInfo fixedABILayout();
 };
 
 class AMDGPUArgumentUsageInfo : public ImmutablePass {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b2360ce30fd6edb..79c3c957424142d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -205,6 +205,11 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
 
   if (STM.isAmdHsaOS())
     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
+
+  if (MFI.getNumKernargPreloadedSGPRs() > 0) {
+    assert(AMDGPU::hasKernargPreload(STM));
+    getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
+  }
 }
 
 void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
@@ -417,6 +422,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
     const SIProgramInfo &PI) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const Function &F = MF.getFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
   amdhsa::kernel_descriptor_t KernelDescriptor;
   memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
@@ -440,6 +446,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
     KernelDescriptor.compute_pgm_rsrc3 =
       CurrentProgramInfo.ComputePGMRSrc3GFX90A;
 
+  if (AMDGPU::hasKernargPreload(STM))
+    KernelDescriptor.kernarg_preload =
+        static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
+
   return KernelDescriptor;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index cffc77ba5d18c15..4326f3c3fbe1ae7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1027,7 +1027,8 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
 }
 
 GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
-                                           const GCNSubtarget &ST) {
+                                           const GCNSubtarget &ST)
+    : ST(ST) {
   const CallingConv::ID CC = F.getCallingConv();
   const bool IsKernel =
       CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
@@ -1068,30 +1069,35 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
       !ST.flatScratchIsArchitected()) {
     FlatScratchInit = true;
   }
-}
 
-unsigned GCNUserSGPRUsageInfo::getNumUsedUserSGPRs() const {
-  unsigned NumUserSGPRs = 0;
   if (hasImplicitBufferPtr())
-    NumUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
 
   if (hasPrivateSegmentBuffer())
-    NumUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
 
   if (hasDispatchPtr())
-    NumUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
 
   if (hasQueuePtr())
-    NumUserSGPRs += getNumUserSGPRForField(QueuePtrID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
 
   if (hasKernargSegmentPtr())
-    NumUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
 
   if (hasDispatchID())
-    NumUserSGPRs += getNumUserSGPRForField(DispatchIdID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
 
   if (hasFlatScratchInit())
-    NumUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
+    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
+}
+
+void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
+  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
+  NumKernargPreloadSGPRs += NumSGPRs;
+  NumUsedUserSGPRs += NumSGPRs;
+}
 
-  return NumUserSGPRs;
+unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
+  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 970ce48de9f47c2..fb50dfd1a346664 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1394,8 +1394,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
 class GCNUserSGPRUsageInfo {
 public:
-  unsigned getNumUsedUserSGPRs() const;
-
   bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
 
   bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
@@ -1410,6 +1408,14 @@ class GCNUserSGPRUsageInfo {
 
   bool hasFlatScratchInit() const { return FlatScratchInit; }
 
+  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
+
+  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
+
+  unsigned getNumFreeUserSGPRs();
+
+  void allocKernargPreloadSGPRs(unsigned NumSGPRs);
+
   enum UserSGPRID : unsigned {
     ImplicitBufferPtrID = 0,
     PrivateSegmentBufferID = 1,
@@ -1447,6 +1453,8 @@ class GCNUserSGPRUsageInfo {
   GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
 
 private:
+  const GCNSubtarget &ST;
+
   // Private memory buffer
   // Compute directly in sgpr[0:1]
   // Other shaders indirect 64-bits at sgpr[0:1]
@@ -1463,6 +1471,10 @@ class GCNUserSGPRUsageInfo {
   bool DispatchID = false;
 
   bool FlatScratchInit = false;
+
+  unsigned NumKernargPreloadSGPRs = 0;
+
+  unsigned NumUsedUserSGPRs = 0;
 };
 
 } // end namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 2787b98575f9181..6b8c03c1620d26b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -830,6 +830,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
   return true;
 }
 
+bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
+    const MCSubtargetInfo &STI) {
+  for (int i = 0; i < 64; ++i) {
+    OS << "\ts_nop 0\n";
+  }
+  return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
+    const MCSubtargetInfo &STI) {
+  const uint32_t Encoded_s_nop = 0xbf800000;
+  MCStreamer &OS = getStreamer();
+  for (int i = 0; i < 64; ++i) {
+    OS.emitInt32(Encoded_s_nop);
+  }
+  return true;
+}
+
 bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
   const uint32_t Encoded_s_code_end = 0xbf9f0000;
   const uint32_t Encoded_s_nop = 0xbf800000;

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index db43de8fcc5fb4b..55b5246c92100a8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -90,6 +90,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
   /// \returns True on success, false on failure.
   virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
 
+  /// \returns True on success, false on failure.
+  virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
+    return true;
+  }
+
   virtual void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
       const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -154,6 +159,9 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
   /// \returns True on success, false on failure.
   bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
 
+  /// \returns True on success, false on failure.
+  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+
   void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
       const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -215,6 +223,9 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   /// \returns True on success, false on failure.
   bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
 
+  /// \returns True on success, false on failure.
+  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+
   void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
       const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80fd1d2ea72ca16..f170428b38c49a5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2240,14 +2240,88 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
     CCInfo.AllocateReg(FlatScratchInitReg);
   }
 
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+}
+
+// Allocate preloaded kernel arguments. Arguments to be preloaded must be
+// sequential starting from the first argument.
+void SITargetLowering::allocatePreloadKernArgSGPRs(
+    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
+    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
+  Function &F = MF.getFunction();
+  unsigned LastExplicitArgOffset =
+      MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
+  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+  bool InPreloadSequence = true;
+  unsigned InIdx = 0;
+  for (auto &Arg : F.args()) {
+    if (!InPreloadSequence || !Arg.hasInRegAttr())
+      break;
+
+    int ArgIdx = Arg.getArgNo();
+    // Don't preload non-original args or parts not in the current preload
+    // sequence.
+    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
+                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+      break;
+
+    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
+           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
+         InIdx++) {
+      assert(ArgLocs[ArgIdx].isMemLoc());
+      auto &ArgLoc = ArgLocs[InIdx];
+      const Align KernelArgBaseAlign = Align(16);
+      unsigned ArgOffset = ArgLoc.getLocMemOffset();
+      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+      unsigned NumAllocSGPRs =
+          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
+
+      // Arg is preloaded into the previous SGPR.
+      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
+        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
+            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
+        continue;
+      }
+
+      unsigned Padding = ArgOffset - LastExplicitArgOffset;
+      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+      // Check for free user SGPRs for preloading.
+      if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+          SGPRInfo.getNumFreeUserSGPRs()) {
+        InPreloadSequence = false;
+        break;
+      }
+
+      // Preload this argument.
+      const TargetRegisterClass *RC =
+          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+      SmallVectorImpl<MCRegister> *PreloadRegs =
+          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
+
+      if (PreloadRegs->size() > 1)
+        RC = &AMDGPU::SGPR_32RegClass;
+      for (auto &Reg : *PreloadRegs) {
+        assert(Reg);
+        MF.addLiveIn(Reg, RC);
+        CCInfo.AllocateReg(Reg);
+      }
+
+      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+    }
+  }
+}
+
+void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
+                                           const SIRegisterInfo &TRI,
+                                           SIMachineFunctionInfo &Info) const {
+  // Always allocate this last since it is a synthetic preload.
   if (Info.hasLDSKernelId()) {
     Register Reg = Info.addLDSKernelId();
     MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
     CCInfo.AllocateReg(Reg);
   }
-
-  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
-  // these from the dispatch pointer.
 }
 
 // Allocate special input registers that are initialized per-wave.
@@ -2554,17 +2628,23 @@ SDValue SITargetLowering::LowerFormalArguments(
     Splits.append(Ins.begin(), Ins.end());
   }
 
+  if (IsKernel)
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+
   if (IsEntryFunc) {
     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+    if (IsKernel && Subtarget->hasKernargPreload() &&
+        !Subtarget->needsKernargPreloadBackwardsCompatibility())
+      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+
+    allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
   } else if (!IsGraphics) {
     // For the fixed ABI, pass workitem IDs in the last argument register.
     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
   }
 
-  if (IsKernel) {
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
-  } else {
+  if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
   }
@@ -2610,9 +2690,81 @@ SDValue SITargetLowering::LowerFormalArguments(
         continue;
       }
 
-      SDValue Arg = lowerKernargMemParameter(
-        DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
-      Chains.push_back(Arg.getValue(1));
+      SDValue NewArg;
+      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
+        if (MemVT.getStoreSize() < 4 && Alignment < 4) {
+          // In this case the argument is packed into the previous preload SGPR.
+          int64_t AlignDownOffset = alignDown(Offset, 4);
+          int64_t OffsetDiff = Offset - AlignDownOffset;
+          EVT IntVT = MemVT.changeTypeToInteger();
+
+          const SIMachineFunctionInfo *Info =
+              MF.getInfo<SIMachineFunctionInfo>();
+          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+          Register Reg =
+              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
+
+          assert(Reg);
+          Register VReg = MRI.getLiveInVirtReg(Reg);
+          SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
+
+          SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
+          SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
+
+          SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
+          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
+          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
+                                  Ins[i].Flags.isSExt(), &Ins[i]);
+
+          NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
+        } else {
+          const SIMachineFunctionInfo *Info =
+              MF.getInfo<SIMachineFunctionInfo>();
+          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+          const SmallVectorImpl<MCRegister> &PreloadRegs =
+              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
+
+          SDValue Copy;
+          if (PreloadRegs.size() == 1) {
+            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
+            const TargetRegisterClass *RC = MRI.getRegClass(VReg);
+            NewArg = DAG.getCopyFromReg(
+                Chain, DL, VReg,
+                EVT::getIntegerVT(*DAG.getContext(),
+                                  TRI->getRegSizeInBits(*RC)));
+
+          } else {
+            // If the kernarg alignment does not match the alignment of the SGPR
+            // tuple RC that can accommodate this argument, it will be built up
+            // via copies from the individual SGPRs that the argument was
+            // preloaded to.
+            SmallVector<SDValue, 4> Elts;
+            for (auto Reg : PreloadRegs) {
+              Register VReg = MRI.getLiveInVirtReg(Reg);
+              Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
+              Elts.push_back(Copy);
+            }
+            NewArg =
+                DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                                    PreloadRegs.size()),
+                                   DL, Elts);
+          }
+
+          SDValue CMemVT;
+          if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
+            CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
+          else
+            CMemVT = DAG.getBitcast(MemVT, NewArg);
+          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
+                                  Ins[i].Flags.isSExt(), &Ins[i]);
+          NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
+        }
+      } else {
+        NewArg =
+            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
+                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
+      }
+      Chains.push_back(NewArg.getValue(1));
 
       auto *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
@@ -2622,11 +2774,11 @@ SDValue SITargetLowering::LowerFormalArguments(
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits.  On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.
-        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
-                          DAG.getValueType(MVT::i16));
+        NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
+                             DAG.getValueType(MVT::i16));
       }
 
-      InVals.push_back(Arg);
+      InVals.push_back(NewArg);
       continue;
     } else if (!IsEntryFunc && VA.isMemLoc()) {
       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 2cc42b9ac2e1fe1..d717e12d29a514a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -546,6 +546,17 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                             const SIRegisterInfo &TRI,
                             SIMachineFunctionInfo &Info) const;
 
+  void allocatePreloadKernArgSGPRs(CCState &CCInfo,
+                                   SmallVectorImpl<CCValAssign> &ArgLocs,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   MachineFunction &MF,
+                                   const SIRegisterInfo &TRI,
+                                   SIMachineFunctionInfo &Info) const;
+
+  void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
+                           const SIRegisterInfo &TRI,
+                           SIMachineFunctionInfo &Info) const;
+
   void allocateSystemSGPRs(CCState &CCInfo,
                            MachineFunction &MF,
                            SIMachineFunctionInfo &Info,

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index bbee6d77733b52b..7ab5ccde4faf4e3 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -243,6 +243,33 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
   return ArgInfo.LDSKernelId.getRegister();
 }
 
+SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
+    const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
+    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+  assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
+         "Preload kernel argument allocated twice.");
+  NumUserSGPRs += PaddingSGPRs;
+  // If the available register tuples are aligned with the kernarg to be
+  // preloaded use that register, otherwise we need to use a set of SGPRs and
+  // merge them.
+  Register PreloadReg =
+      TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+  if (PreloadReg &&
+      (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
+    ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
+    NumUserSGPRs += AllocSizeDWord;
+  } else {
+    for (unsigned I = 0; I < AllocSizeDWord; ++I) {
+      ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
+      NumUserSGPRs++;
+    }
+  }
+
+  // Track the actual number of SGPRs that HW will preload to.
+  UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
+  return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.
@@ -570,6 +597,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
     return true;
   };
 
+  // TODO: Need to serialize kernarg preloads.
   bool Any = false;
   Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
   Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 51d5bab7a142961..7ff50c80081d30a 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -21,6 +21,7 @@
 #include "SIInstrInfo.h"
 #include "SIModeRegisterDefaults.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Support/raw_ostream.h"
@@ -593,6 +594,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return PrologEpilogSGPRSpills;
   }
 
+  GCNUserSGPRUsageInfo &getUserSGPRInfo() { return UserSGPRInfo; }
+
   const GCNUserSGPRUsageInfo &getUserSGPRInfo() const { return UserSGPRInfo; }
 
   void addToPrologEpilogSGPRSpills(Register Reg,
@@ -727,6 +730,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   Register addFlatScratchInit(const SIRegisterInfo &TRI);
   Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
   Register addLDSKernelId();
+  SmallVectorImpl<MCRegister> *
+  addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
+                      unsigned AllocSizeDWord, int KernArgIdx,
+                      int PaddingSGPRs);
 
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
@@ -872,6 +879,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
+  unsigned getNumKernargPreloadedSGPRs() const {
+    return UserSGPRInfo.getNumKernargPreloadSGPRs();
+  }
+
   Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
     return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
   }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 314e9b2b99bfbce..66bff4a14cac84b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -22,7 +22,7 @@ declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
 declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
 declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
 
-define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -69,7 +69,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -105,7 +105,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -152,7 +152,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -188,7 +188,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -235,7 +235,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -271,7 +271,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -318,7 +318,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -354,7 +354,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -401,7 +401,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -437,7 +437,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -484,7 +484,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -520,7 +520,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -567,7 +567,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -603,7 +603,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -650,7 +650,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -686,7 +686,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -733,7 +733,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -769,7 +769,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -816,7 +816,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -852,7 +852,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -899,7 +899,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -935,7 +935,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -982,7 +982,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34

diff  --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index 2e56839587f329e..43a161840fd258e 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -541,7 +541,7 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1)
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.swap:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -563,7 +563,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.add:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -585,7 +585,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) inre
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.sub:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -607,7 +607,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) inre
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.smin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -629,7 +629,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.smax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -651,7 +651,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.umin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -673,7 +673,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.umax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -695,7 +695,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.and:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -717,7 +717,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) inre
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.or:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -739,7 +739,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) inreg
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.xor:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -761,7 +761,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) inre
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.inc:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -783,7 +783,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) inre
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.dec:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -805,7 +805,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) inre
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.cmpswap:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -828,7 +828,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8)
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.fadd:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -852,7 +852,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.fmin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -877,7 +877,7 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) inr
   ret void
 }
 
-define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) {
+define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) {
 ; CHECK-LABEL: buffer.ptr.atomic.fmax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x34

diff  --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index cf6a5e7b439ea25..58b0a0f56918b0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -24,7 +24,7 @@ declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
 declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
 declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
 
-define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -71,7 +71,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -107,7 +107,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -154,7 +154,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -190,7 +190,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -237,7 +237,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -273,7 +273,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -320,7 +320,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -356,7 +356,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -403,7 +403,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -439,7 +439,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -486,7 +486,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -522,7 +522,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -569,7 +569,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -605,7 +605,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -652,7 +652,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -688,7 +688,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -735,7 +735,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -771,7 +771,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -818,7 +818,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -854,7 +854,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -901,7 +901,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -937,7 +937,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -984,7 +984,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -1020,7 +1020,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -1067,7 +1067,7 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
+define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34

diff  --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
new file mode 100644
index 000000000000000..75feac35dacd847
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN %s
+
+; GCN: preload_kernarg_header
+; GCN-COUNT-64: s_nop 0
+define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { ; kernel entry point with one pointer argument; the checks above expect 64 `s_nop 0` after its name
+    store ptr %arg, ptr %arg ; writes %arg to the address held in %arg, giving the argument a use
+    ret void
+}
+
+; GCN: non_kernel_function
+; GCN-NOT: s_nop 0
+; GCN: flat_store
+define void @non_kernel_function(ptr %arg) { ; same body but without the amdgpu_kernel calling convention; the checks above expect no `s_nop 0` before its flat_store
+    store ptr %arg, ptr %arg ; writes %arg to the address held in %arg
+    ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
new file mode 100644
index 000000000000000..57980214e58e2b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -0,0 +1,5421 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-1 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-2 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-4 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-8 %s
+
+define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
+; NO-PRELOAD-LABEL: ptr1_i8:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: ptr1_i8:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: ptr1_i8:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xff
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: ptr1_i8:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: ptr1_i8:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xff
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  %ext = zext i8 %arg0 to i32
+  store i32 %ext, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: ptr1_i8_zext_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_mov_b32 s0, 0xffff
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: ptr1_i8_zext_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_mov_b32 s0, 0xffff
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  %ext = zext i8 %arg0 to i32
+  store i32 %ext, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
+; NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: ptr1_i16_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xffff
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: ptr1_i16_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xffff
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  %ext = zext i16 %arg0 to i32
+  store i32 %ext, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
+; NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: ptr1_i32_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: ptr1_i32_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store i32 %arg0, ptr addrspace(1) %out
+  ret void
+}
+
+; Check alignment on the second preloaded arg.
+
+define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
+; NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s4
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s3, s[0:1], 0x10
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    s_add_i32 s0, s2, s3
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x10
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-2-NEXT:    s_add_i32 s0, s2, s0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_add_i32 s0, s2, s6
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_add_i32 s0, s2, s6
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  %add = add i32 %arg0, %arg1
+  store i32 %add, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; NO-PRELOAD-NEXT:    s_and_b32 s1, s4, 0xffff
+; NO-PRELOAD-NEXT:    s_add_i32 s0, s1, s0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
+; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
+; PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-2-NEXT:    s_and_b32 s1, s4, 0xffff
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s0, 16
+; PRELOAD-2-NEXT:    s_add_i32 s0, s1, s0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
+; PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-8-NEXT:    s_and_b32 s1, s4, 0xffff
+; PRELOAD-8-NEXT:    s_add_i32 s0, s1, s0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  %ext = zext i16 %arg0 to i32
+  %ext1 = zext i16 %arg1 to i32
+  %add = add i32 %ext, %ext1
+  store i32 %add, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
+; NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
+; PRELOAD-4-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <2 x i8> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Don't try to preload byref args. %after.offset, which follows the byref arg, is not preloaded either.
+
+define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; NO-PRELOAD-LABEL: byref_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
+; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
+; NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: byref_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
+; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: byref_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s1
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-2-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: byref_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: byref_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s1
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-8-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
+; PRELOAD-8-NEXT:    s_endpgm
+  %in = load i32, ptr addrspace(4) %in.byref
+  store volatile i32 %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; TODO: Should do partial preload in cases like this, where only part of the arg
+; can be preloaded. For now the whole <8 x i32> is loaded with s_load_dwordx8.
+
+define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
+; NO-PRELOAD-LABEL: v8i32_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
+; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; NO-PRELOAD-NEXT:    s_nop 1
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s7
+; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v8i32_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-1-NEXT:    s_nop 1
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v8i32_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-2-NEXT:    s_nop 1
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v8i32_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-4-NEXT:    s_nop 1
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v8i32_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-8-NEXT:    s_nop 1
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <8 x i32> %in, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
+; NO-PRELOAD-LABEL: v3i16_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v3i16_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
+; PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v3i16_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v3i16_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v3i16_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <3 x i16> %in, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
+; NO-PRELOAD-LABEL: v3i32_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v3i32_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v3i32_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v3i32_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v3i32_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <3 x i32> %in, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
+; NO-PRELOAD-LABEL: v3f32_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v3f32_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v3f32_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v3f32_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
+; PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v3f32_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <3 x float> %in, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
+; NO-PRELOAD-LABEL: v5i8_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
+; NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v5i8_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
+; PRELOAD-1-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
+; PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v5i8_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s5
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; PRELOAD-2-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v5i8_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s5
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
+; PRELOAD-4-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; PRELOAD-4-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v5i8_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s5
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; PRELOAD-8-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <5 x i8> %in, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Stores the <5 x double> kernel argument %in to the address in %out.
+; Expected output, as recorded in the assertions below: under the NO-PRELOAD
+; prefix, %out is loaded from kernarg offset 0x0. Under every PRELOAD-n prefix,
+; 64 `s_nop 0` instructions precede %bb.0, %out is used directly from s[2:3]
+; with no load, and %in is still loaded from kernarg offsets 0x40 and 0x60.
+; NOTE(review): presumably %in is too large to be preloaded here; confirm
+; against the lowering code, which is not visible from this test.
+define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
+; NO-PRELOAD-LABEL: v5f64_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
+; NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; NO-PRELOAD-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; NO-PRELOAD-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
+; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; NO-PRELOAD-NEXT:    s_nop 1
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s7
+; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v5f64_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-1-NEXT:    s_nop 1
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v5f64_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-2-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-2-NEXT:    s_nop 1
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v5f64_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-4-NEXT:    s_nop 1
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v5f64_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-8-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
+; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; PRELOAD-8-NEXT:    s_nop 1
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
+; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <5 x double> %in, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; Stores the <8 x i8> kernel argument %in to the address in %out.
+; Expected output, as recorded in the assertions below: under NO-PRELOAD both
+; arguments come from one s_load_dwordx4. Under PRELOAD-1, %out is used from
+; s[2:3] and only %in is loaded (kernarg offset 0x8). Under PRELOAD-2, -4 and
+; -8 no load is emitted; %in is read from s4/s5 and reassembled with shifts
+; and SDWA ORs before a single dwordx2 store. Every PRELOAD-n variant also
+; expects 64 `s_nop 0` instructions ahead of %bb.0.
+define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
+; NO-PRELOAD-LABEL: v8i8_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: v8i8_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: v8i8_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 24
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 16
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: v8i8_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 8
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 24
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 16
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
+; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: v8i8_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 24
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 16
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store <8 x i8> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) {
+; NO-PRELOAD-LABEL: i64_kernel_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: i64_kernel_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: i64_kernel_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: i64_kernel_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: i64_kernel_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store i64 %a, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) {
+; NO-PRELOAD-LABEL: f64_kernel_preload_arg:
+; NO-PRELOAD:       ; %bb.0:
+; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; NO-PRELOAD-NEXT:    s_endpgm
+;
+; PRELOAD-1-LABEL: f64_kernel_preload_arg:
+; PRELOAD-1:         s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:    s_nop 0
+; PRELOAD-1-NEXT:  ; %bb.0:
+; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-1-NEXT:    s_endpgm
+;
+; PRELOAD-2-LABEL: f64_kernel_preload_arg:
+; PRELOAD-2:         s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:    s_nop 0
+; PRELOAD-2-NEXT:  ; %bb.0:
+; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-2-NEXT:    s_endpgm
+;
+; PRELOAD-4-LABEL: f64_kernel_preload_arg:
+; PRELOAD-4:         s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:    s_nop 0
+; PRELOAD-4-NEXT:  ; %bb.0:
+; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-4-NEXT:    s_endpgm
+;
+; PRELOAD-8-LABEL: f64_kernel_preload_arg:
+; PRELOAD-8:         s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:    s_nop 0
+; PRELOAD-8-NEXT:  ; %bb.0:
+; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; PRELOAD-8-NEXT:    s_endpgm
+  store double %in, ptr addrspace(1) %out
+  ret void
+}


        


More information about the llvm-commits mailing list