[llvm] [AMDGPU] Support preloading hidden kernel arguments (PR #98861)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 4 17:31:43 PDT 2024


https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/98861

>From 01d38d00edc74e97e1e0cbaf3976a74711dcc5f4 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Sun, 14 Jul 2024 14:43:12 -0700
Subject: [PATCH] [AMDGPU] Support preloading hidden kernel arguments

Adds hidden kernel arguments to the function signature and marks them
inreg if they should be preloaded into user SGPRs. The normal kernarg
preloading logic then takes over with some additional checks for the
correct implicitarg_ptr alignment.

Special care is needed so that metadata for the hidden arguments is not
added twice when generating the code object.
---
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |    8 +-
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     |  185 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |    5 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   26 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |    9 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |    3 +
 .../AMDGPU/preload-implicit-kernargs.ll       |  698 ++++
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  | 3094 +++++++++--------
 8 files changed, 2648 insertions(+), 1380 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 26116bfa3c2feb..5a9f87d62ca8cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -260,8 +260,14 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
   auto &Func = MF.getFunction();
   unsigned Offset = 0;
   auto Args = HSAMetadataDoc->getArrayNode();
-  for (auto &Arg : Func.args())
+  for (auto &Arg : Func.args()) {
+    if (Func.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
+                                                     Arg.getArgNo(),
+                                                 "amdgpu-hidden-argument"))
+      continue;
+
     emitKernelArg(Arg, Offset, Args);
+  }
 
   emitHiddenKernelArgs(MF, Offset, Args);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 83cce6021693a2..e2420bf8a720ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -13,6 +13,8 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -31,9 +33,97 @@ class PreloadKernelArgInfo {
   const GCNSubtarget &ST;
   unsigned NumFreeUserSGPRs;
 
-public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+  enum HiddenArg : unsigned { // Values index HiddenArgs, in the same order.
+    HIDDEN_BLOCK_COUNT_X,
+    HIDDEN_BLOCK_COUNT_Y,
+    HIDDEN_BLOCK_COUNT_Z,
+    HIDDEN_GROUP_SIZE_X,
+    HIDDEN_GROUP_SIZE_Y,
+    HIDDEN_GROUP_SIZE_Z,
+    HIDDEN_REMAINDER_X,
+    HIDDEN_REMAINDER_Y,
+    HIDDEN_REMAINDER_Z,
+    END_HIDDEN_ARGS // Entry count; used as the bound when indexing.
+  };
+
+  struct HiddenArgInfo { // Describes one hidden kernel argument.
+    uint8_t Offset;   // Byte offset matched against implicit-arg load offsets.
+    uint8_t Size;     // Size in bytes; Size * 8 is the argument's bit width.
+    const char *Name; // Name given to the added function argument.
+  };
+  // One {Offset, Size, Name} entry per HiddenArg value, in enum order.
+  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
+      {22, 2, "_hidden_remainder_z"}};
+
+  /// Map a byte offset within the implicit arguments to its HiddenArg.
+  static HiddenArg getHiddenArgIndexFromOffset(unsigned Offset) {
+    for (const HiddenArgInfo &Info : HiddenArgs)
+      if (Info.Offset == Offset)
+        return static_cast<HiddenArg>(&Info - HiddenArgs);
+    llvm_unreachable("Unexpected hidden argument offset.");
+  }
+
+  /// Integer type whose width matches the size of hidden argument \p HA.
+  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+    if (HA >= END_HIDDEN_ARGS)
+      llvm_unreachable("Unexpected hidden argument.");
+    return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
+  }
+
+  /// Name used for the function argument that carries hidden argument \p HA.
+  static const char *getHiddenArgName(HiddenArg HA) {
+    if (HA >= END_HIDDEN_ARGS)
+      llvm_unreachable("Unexpected hidden argument.");
+    return HiddenArgs[HA].Name;
+  }
+  // Build NF: F's signature plus hidden arguments [0, LastPreloadIndex].
+  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
+    FunctionType *FT = F.getFunctionType();
+    LLVMContext &Ctx = F.getParent()->getContext();
+    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
+      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
+    // Same return type and varargs-ness as F, with the extended parameters.
+    FunctionType *NFT =
+        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
+    Function *NF =
+        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
+    // Carry over F's attributes, metadata and debug-info format.
+    NF->copyAttributesFrom(&F);
+    NF->copyMetadata(&F, 0);
+    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+    // Insert NF before F, then move F's name and basic blocks into NF.
+    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+    NF->takeName(&F);
+    NF->splice(NF->begin(), &F);
+    // Redirect uses of F's arguments to the leading arguments of NF.
+    Function::arg_iterator NFArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(&*NFArg);
+      NFArg->takeName(&Arg);
+      ++NFArg;
+    }
+    // NFArg is now at the first hidden argument; tag and name each of them.
+    AttrBuilder AB(Ctx);
+    AB.addAttribute(Attribute::InReg);
+    AB.addAttribute("amdgpu-hidden-argument");
+    AttributeList AL = NF->getAttributes();
+    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
+      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
+      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
+    }
+    // Uses of F are redirected to NF; F itself is not erased here.
+    NF->setAttributes(AL);
+    F.replaceAllUsesWith(NF);
+
+    return NF;
+  }
 
+public:
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
@@ -64,6 +154,89 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments. Only simple
+  // loads that exactly match a HiddenArgs entry (offset and type) are used.
+  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
+    Function *ImplicitArgPtr = F.getParent()->getFunction(Name);
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      Instruction *CI = dyn_cast<Instruction>(U);
+      if (!CI || CI->getParent()->getParent() != &F)
+        continue;
+
+      for (auto *U : CI->users()) {
+        int64_t Offset = 0;
+        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+        if (!Load) {
+          // Without users there is no load and user_begin() is not valid.
+          if (U->user_empty() ||
+              GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+            continue;
+          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+        }
+
+        if (!Load || !Load->isSimple())
+          continue;
+
+        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+        // Other loads cannot be replaced by a hidden argument; keep them.
+        if (llvm::none_of(HiddenArgs, [&](const HiddenArgInfo &Info) {
+              return Info.Offset == Offset &&
+                     Load->getType()->isIntegerTy(Info.Size * 8);
+            }))
+          continue;
+
+        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+      }
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    llvm::sort(ImplicitArgLoads, llvm::less_second());
+
+    // End of the last allocated argument, measured like ArgOffset below.
+    uint64_t LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    // If we fail to preload any implicit argument we know we don't have SGPRs
+    // to preload any subsequent ones with larger offsets. Find the first
+    // argument that we cannot preload.
+    auto *PreloadEnd = std::find_if(
+        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+        [&](const std::pair<LoadInst *, unsigned> &Load) {
+          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
+          uint64_t ArgOffset = ImplicitArgsBaseOffset + Load.second;
+          // A repeated load of an argument that already has its SGPRs.
+          if (ArgOffset + LoadSize <= LastExplicitArgOffset)
+            return false;
+          if (!tryAllocPreloadSGPRs(LoadSize, ArgOffset, LastExplicitArgOffset))
+            return true;
+          // Include the base offset so the next padding is computed correctly.
+          LastExplicitArgOffset = ArgOffset + LoadSize;
+          return false;
+        });
+
+    if (PreloadEnd == ImplicitArgLoads.begin())
+      return;
+
+    unsigned LastHiddenArgIndex = getHiddenArgIndexFromOffset(PreloadEnd[-1].second);
+    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
+    assert(NF);
+    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
+      unsigned HiddenArgIndex = getHiddenArgIndexFromOffset(I->second);
+      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
+      I->first->replaceAllUsesWith(NF->getArg(Index));
+    }
+  }
 };
 
 class AMDGPULowerKernelArguments : public FunctionPass {
@@ -281,6 +454,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
 
+  if (InPreloadSequence) {
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                Builder);
+  }
+
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 2e1bdf46924783..04863f1f8b4dcf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -611,6 +611,11 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
   MaxAlign = Align(1);
 
   for (const Argument &Arg : F.args()) {
+    if (F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
+                                                  Arg.getArgNo(),
+                                              "amdgpu-hidden-argument"))
+      continue;
+
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
     Align Alignment = DL.getValueOrABITypeAlignment(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4d58e519081825..5d6ce6d7340179 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2511,19 +2511,20 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
   bool InPreloadSequence = true;
   unsigned InIdx = 0;
+  bool AlignedForImplictArgs = false;
   for (auto &Arg : F.args()) {
     if (!InPreloadSequence || !Arg.hasInRegAttr())
       break;
 
-    int ArgIdx = Arg.getArgNo();
+    unsigned ArgIdx = Arg.getArgNo();
     // Don't preload non-original args or parts not in the current preload
     // sequence.
-    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
-                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+    if (InIdx < Ins.size() &&
+        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
       break;
 
     for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
-           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
+           Ins[InIdx].getOrigArgIndex() == ArgIdx;
          InIdx++) {
       assert(ArgLocs[ArgIdx].isMemLoc());
       auto &ArgLoc = ArgLocs[InIdx];
@@ -2533,6 +2534,23 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
       unsigned NumAllocSGPRs =
           alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
 
+      // Add padding SGPR to fix alignment for hidden arguments.
+      if (!AlignedForImplictArgs &&
+          F.getAttributes().hasAttributeAtIndex(AttributeList::FirstArgIndex +
+                                                    Arg.getArgNo(),
+                                                "amdgpu-hidden-argument")) {
+        unsigned OffsetBefore = LastExplicitArgOffset;
+        LastExplicitArgOffset = alignTo(
+            LastExplicitArgOffset, Subtarget->getAlignmentForImplicitArgPtr());
+        if (OffsetBefore != LastExplicitArgOffset) {
+          unsigned PaddingSGPRs =
+              alignTo(LastExplicitArgOffset - OffsetBefore, 4) / 4;
+          Info.allocateUserSGPRs(PaddingSGPRs);
+          ArgOffset += PaddingSGPRs * 4;
+        }
+        AlignedForImplictArgs = true;
+      }
+
       // Arg is preloaded into the previous SGPR.
       if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
         Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c5251826b117cb..c2e734f1386d02 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -278,6 +278,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }
 
+// Raise the user SGPR count to \p Number. Returns false, changing nothing,
+// when the current count is already at least \p Number.
+bool SIMachineFunctionInfo::allocateUserSGPRs(unsigned Number) {
+  const bool Grows = Number > getNumUserSGPRs();
+  if (Grows)
+    NumUserSGPRs = Number;
+  return Grows;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7af5e7388f841e..f2ff919a4bc0ef 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -760,6 +760,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
                       unsigned AllocSizeDWord, int KernArgIdx,
                       int PaddingSGPRs);
 
+  /// Reserve up to \p Number of user SGPRs.
+  bool allocateUserSGPRs(unsigned Number);
+
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
     Register Next = getNextUserSGPR();
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
new file mode 100644
index 00000000000000..4b0b54a06ccde6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -0,0 +1,698 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor < %s| llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=16 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 | FileCheck -check-prefixes=GFX940-PRELOAD %s
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=16 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a | FileCheck -check-prefixes=GFX90a-PRELOAD %s
+
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0xc
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s10
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-NEXT:    s_add_i32 s0, s5, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT:    s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-NEXT:    s_add_i32 s0, s9, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  %ext = zext i8 %val to i32
+  %add = add i32 %load, %ext
+  store i32 %add, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s2, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+  %load_x = load i32, ptr addrspace(4) %gep_x
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load_y = load i32, ptr addrspace(4) %gep_y
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load_z = load i32, ptr addrspace(4) %gep_z
+  %ins.0 =  insertelement <3 x i32> undef, i32 %load_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 ; byte offset 14 - the field this test names "workgroup size y"
+  %load = load i16, ptr addrspace(4) %gep ; 16-bit field; the expected code takes the high half (lshr 16) of the dword at kernarg offset 0x14, or of a preloaded SGPR
+  %conv = zext i16 %load to i32 ; widen so a full dword is stored
+  store i32 %conv, ptr addrspace(1) %out ; write the value to %out so the load stays live
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 ; byte offset 16 - the field this test names "workgroup size z"
+  %load = load i16, ptr addrspace(4) %gep ; 16-bit field; the expected code masks the low half (and 0xffff) of the dword at kernarg offset 0x18, or of a preloaded SGPR
+  %conv = zext i16 %load to i32 ; widen so a full dword is stored
+  store i32 %conv, ptr addrspace(1) %out ; write the value to %out so the load stays live
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[0:1] offset:20
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:24
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[4:5] offset:20
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:24
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s1, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 ; byte offset 12 - the "workgroup size x" field
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 ; byte offset 14 - the "workgroup size y" field
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 ; byte offset 16 - the "workgroup size z" field
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> poison, i32 %conv_x, i32 0 ; poison seed: all three lanes are overwritten, so no undef is needed
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out ; one 96-bit store of {x, y, z} keeps all three loads live
+  ret void
+}
+
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 ; byte offset 18 - the field this test names "remainder x"
+  %load = load i16, ptr addrspace(4) %gep ; 16-bit field; the expected code takes the high half (lshr 16) of the dword at kernarg offset 0x18, or of a preloaded SGPR
+  %conv = zext i16 %load to i32 ; widen so a full dword is stored
+  store i32 %conv, ptr addrspace(1) %out ; write the value to %out so the load stays live
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 ; byte offset 20 - the field this test names "remainder y"
+  %load = load i16, ptr addrspace(4) %gep ; 16-bit field; the expected code masks the low half (and 0xffff) of the dword at kernarg offset 0x1c, or of a preloaded SGPR
+  %conv = zext i16 %load to i32 ; widen so a full dword is stored
+  store i32 %conv, ptr addrspace(1) %out ; write the value to %out so the load stays live
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 ; byte offset 22 - the field this test names "remainder z"
+  %load = load i16, ptr addrspace(4) %gep ; 16-bit field; the expected code takes the high half (lshr 16) of the dword at kernarg offset 0x1c, or of a preloaded SGPR
+  %conv = zext i16 %load to i32 ; widen so a full dword is stored
+  store i32 %conv, ptr addrspace(1) %out ; write the value to %out so the load stays live
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[0:1] offset:26
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:30
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s1, s8, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s4, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[4:5] offset:26
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:30
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s1, s12, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; base pointer of the implicit (hidden) kernel arguments
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 ; byte offset 18 - the "remainder x" field
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 ; byte offset 20 - the "remainder y" field
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 ; byte offset 22 - the "remainder z" field
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> poison, i32 %conv_x, i32 0 ; poison seed: all three lanes are overwritten, so no undef is needed
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out ; one 96-bit store of {x, y, z} keeps all three loads live
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index a547c258e3921d..6e6daca1363bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,18 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=2 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=8 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
 
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=2 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=8 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
 
-define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -23,51 +19,27 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -78,56 +50,32 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 0xff
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 0xff
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -138,51 +86,29 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -193,56 +119,34 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -253,51 +157,27 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -308,56 +188,32 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 0xffff
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 0xffff
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i16 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -367,47 +223,25 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -417,52 +251,30 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store i32 %arg0, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 {
-; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
@@ -474,55 +286,29 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x10
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    s_add_i32 s0, s5, s4
+; GFX940-PRELOAD-2-NEXT:    s_add_i32 s0, s2, s0
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_add_i32 s0, s2, s6
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    s_add_i32 s0, s5, s4
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x0
@@ -534,60 +320,34 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_add_i32 s0, s6, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_add_i32 s0, s6, s10
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %add = add i32 %arg0, %arg1
   store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -600,59 +360,33 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-2-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX940-PRELOAD-2-NEXT:    s_add_i32 s0, s1, s0
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
 ; GFX940-PRELOAD-8-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX940-PRELOAD-8-NEXT:    s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -665,56 +399,30 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s1, s8, 0xffff
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-2-NEXT:    s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX90a-PRELOAD-2-NEXT:    s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-8-NEXT:    s_add_i32 s0, s1, s0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-8-NEXT:    s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i16 %arg0 to i32
   %ext1 = zext i16 %arg1 to i32
@@ -723,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
   ret void
 }
 
-define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -734,47 +442,29 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -784,52 +474,34 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v0, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <2 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
 
-define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
-; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -843,63 +515,37 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x100
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x100
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
@@ -913,59 +559,33 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
+; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[2:3]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
+; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[2:3]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %in = load i32, ptr addrspace(4) %in.byref
@@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 }
 
 
-define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
@@ -995,83 +615,47 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i32_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
+; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_nop 1
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i32_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
+; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_nop 1
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1090,87 +674,51 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
+; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
 ; GFX90a-PRELOAD-2-NEXT:    s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
+; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
 ; GFX90a-PRELOAD-8-NEXT:    s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <8 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1181,51 +729,29 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-4-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1236,55 +762,33 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <3 x i16> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1296,55 +800,29 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1356,59 +834,33 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <3 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1420,55 +872,29 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1480,59 +906,33 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <3 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1543,51 +943,43 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-4-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1598,55 +990,47 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <5 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -1668,95 +1052,53 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5f64_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
+; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
 ; GFX940-PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_nop 1
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5f64_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
+; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
 ; GFX940-PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_nop 1
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -1778,99 +1120,57 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
+; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
 ; GFX90a-PRELOAD-2-NEXT:    s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
+; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
 ; GFX90a-PRELOAD-8-NEXT:    s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <5 x double> %in, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -1879,43 +1179,57 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -1924,46 +1238,58 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <8 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) #0 {
+define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) {
 ; GFX940-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
@@ -1974,44 +1300,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
 ; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
@@ -2024,50 +1328,28 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store i64 %a, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) #0 {
+define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) {
 ; GFX940-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
@@ -2078,44 +1360,22 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1:       ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-2:       ; %bb.0:
-; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4:       ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-8:       ; %bb.0:
-; GFX940-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
 ; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
@@ -2128,47 +1388,1135 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:       ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-2:       ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:       ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-8:       ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
 ; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store double %in, ptr addrspace(1) %out
   ret void
 }
 
-attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) {
+; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) {
+; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <2 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <3 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <6 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s10, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s11
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s10, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[10:11] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  store <7 x bfloat> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) {
+; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i1 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) {
+; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store fp128 %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Kernel that stores its <7 x i8> argument %in to %out. Expected output is
+; checked under the NO-PRELOAD, PRELOAD-2 and PRELOAD-8 prefixes for GFX940 and
+; GFX90a. The PRELOAD-* checks expect no s_load instructions; the stored bytes
+; are rebuilt from SGPRs with shift/or sequences.
+define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x i8> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Kernel that stores its <7 x half> argument %in to %out. Expected output is
+; checked under the NO-PRELOAD, PRELOAD-2 and PRELOAD-8 prefixes for GFX940 and
+; GFX90a. The NO-PRELOAD checks load the vector from kernarg offset 0x10; the
+; PRELOAD-* checks expect no s_load instructions at all.
+define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[2:3] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[6:7] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x half> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Test when the previous argument was not dword aligned: an i16 argument is
+; followed by an i32 and a pointer. The kernel stores %in to %out and %in2 to
+; %out2. The PRELOAD-2 checks still expect s_load instructions at kernarg
+; offsets 0xc and 0x10, while the PRELOAD-8 checks expect no s_load at all.
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0xc
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i32 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+; An i16 argument followed by a <3 x i32> and a pointer. The kernel stores %in
+; to %out and %in2 to %out2. The PRELOAD-2 checks still expect s_load
+; instructions at kernarg offsets 0x10 and 0x20. Under PRELOAD-8 the GFX940
+; checks expect no s_load, while the GFX90a checks still expect one
+; s_load_dwordx2 at offset 0x20.
+define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s7, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <3 x i32> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+; Two consecutive i16 arguments followed by a pointer. The kernel stores %in to
+; %out and %in2 to %out2. The PRELOAD-2 checks still expect s_load instructions
+; at kernarg offsets 0x8 and 0x10. The PRELOAD-8 checks expect no s_load, with
+; both stores reading the same VGPR (global_store_short for the low half,
+; global_store_short_d16_hi for the high half).
+define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i16 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+; An i16 argument followed by a <2 x i8> and a pointer. The kernel stores %in
+; to %out and %in2 to %out2. The PRELOAD-2 checks still expect s_load
+; instructions at kernarg offsets 0x8 and 0x10. The PRELOAD-8 checks expect no
+; s_load; the value stored to %out2 is rebuilt from an SGPR with a shift/or
+; sequence.
+define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v0, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <2 x i8> %in2, ptr addrspace(1) %out2
+  ret void
+}



More information about the llvm-commits mailing list