[llvm] [AMDGPU] Add support for preloading hidden groupsize args (PR #83817)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 18 01:01:50 PDT 2024


https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/83817

>From 52595aebdcf146d3a3bbe24724c2422342161448 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Mon, 4 Mar 2024 01:08:55 -0800
Subject: [PATCH] [AMDGPU] Add support for preloading implicit kernel arguments

Implicit arguments may be preloaded into User SGPRs via the same
mechanism as explicit arguments if their offsets within the kernarg
segment fall within the range of available registers. Lowering of these
implicit arguments may happen early so the implementation here follows
the same concept and is mostly agnostic to which values are being
loaded, and instead only cares about offsets from the implicitarg
pointer and the size of the values being used. Unlike preloading of
explicit arguments there are not restrictions on exactly which implicit
arguments are used and whether there is a unbroken sequence of used
arguments, but instead this will attempt to preload anything that falls
within the range of available User SGPRs on the target HW.

A limitation of this patch is that it only supports i16/i32 arguments,
but like other details of preloading kernargs for both implicit and
explicit arguments this is likely to be expanded and changed in the near
future.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   25 +-
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.h   |    1 +
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     |   94 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  128 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   19 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   14 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |    2 +
 .../AMDGPU/preload-implict-kernargs.ll        | 2310 +++++++++++++++++
 8 files changed, 2575 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819d2..3b5059ad1331e8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -12,8 +12,8 @@
 
 def global_ptr_ty : LLVMQualPointerType<1>;
 
-class AMDGPUReadPreloadRegisterIntrinsic
-  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
+class AMDGPUReadPreloadRegisterIntrinsic<LLVMType type>
+  : DefaultAttrsIntrinsic<[type], [], [IntrNoMem, IntrSpeculatable]>;
 
 class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;
@@ -27,10 +27,10 @@ class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
 
 let TargetPrefix = "r600" in {
 
-multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
-  def _x : AMDGPUReadPreloadRegisterIntrinsic;
-  def _y : AMDGPUReadPreloadRegisterIntrinsic;
-  def _z : AMDGPUReadPreloadRegisterIntrinsic;
+multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz<LLVMType type> {
+  def _x : AMDGPUReadPreloadRegisterIntrinsic<type>;
+  def _y : AMDGPUReadPreloadRegisterIntrinsic<type>;
+  def _z : AMDGPUReadPreloadRegisterIntrinsic<type>;
 }
 
 multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
@@ -46,8 +46,8 @@ defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
 defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                           <"__builtin_r600_read_tgid">;
 
-defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
-defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
+defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
 
 def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
   Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;
@@ -138,10 +138,17 @@ let TargetPrefix = "amdgcn" in {
 // ABI Special Intrinsics
 //===----------------------------------------------------------------------===//
 
-defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
+defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz<llvm_i32_ty>;
 defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                <"__builtin_amdgcn_workgroup_id">;
 
+// This intrinsic is used to track the user SGPRs that hold implicit kernel
+// arguments. The i32 argument for this intrinsic represents the offset in
+// bytes from the value pointed to by the implicitarg pointer.
+def int_amdgcn_preload_implicitarg :
+  DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable]>;
+
 def int_amdgcn_dispatch_ptr :
   DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
   [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8c4..e6aed12673c941 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -95,6 +95,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
 struct KernArgPreloadDescriptor : public ArgDescriptor {
   KernArgPreloadDescriptor() {}
   SmallVector<MCRegister> Regs;
+  unsigned ByteOffset;
 };
 
 struct AMDGPUFunctionArgInfo {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index bc58407a73294c..e5511981563369 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -13,11 +13,14 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetMachine.h"
+#include <algorithm>
 
 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
 
@@ -32,8 +35,6 @@ class PreloadKernelArgInfo {
   unsigned NumFreeUserSGPRs;
 
 public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
-
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
@@ -64,6 +65,86 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(unsigned ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    IntrinsicInst *ImplicitArgPtr = nullptr;
+    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+        if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(I))
+          if (CI->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr) {
+            ImplicitArgPtr = CI;
+            break;
+          }
+      }
+    }
+
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      if (!U->hasOneUse())
+        continue;
+
+      int64_t Offset = 0;
+      auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+      if (!Load) {
+        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != ImplicitArgPtr)
+          continue;
+        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      }
+
+      if (!Load || !Load->isSimple())
+        continue;
+
+      // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+      if (Load->getType() != Builder.getInt16Ty() &&
+          Load->getType() != Builder.getInt32Ty())
+        continue;
+
+      ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be prelaoded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+
+    unsigned LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    bool HasPreloadImplicitArgs = false;
+    for (const auto &Load : ImplicitArgLoads) {
+      LoadInst *LoadInst = Load.first;
+      Type *LoadType = LoadInst->getType();
+      auto LoadOffset = Load.second;
+      unsigned LoadSize = DL.getTypeStoreSize(LoadType);
+      // If we fail to preload any implicit argument we know we don't have SGPRs
+      // to preload any subsequent ones with larger offsets.
+      if (!tryAllocPreloadSGPRs(LoadSize, LoadOffset + ImplicitArgsBaseOffset,
+                                LastExplicitArgOffset))
+        break;
+
+      HasPreloadImplicitArgs = true;
+      LastExplicitArgOffset = LoadOffset + LoadSize;
+      llvm::Value *LoadOffsetValue =
+          llvm::ConstantInt::get(Builder.getInt32Ty(), LoadOffset);
+      CallInst *PreloadIntrin = Builder.CreateIntrinsic(
+          Intrinsic::amdgcn_preload_implicitarg, {LoadType}, {LoadOffsetValue});
+      LoadInst->replaceAllUsesWith(PreloadIntrin);
+    }
+
+    if (HasPreloadImplicitArgs)
+      F.addFnAttr("amdgpu-preload-implicitargs");
+  }
 };
 
 class AMDGPULowerKernelArguments : public FunctionPass {
@@ -282,6 +363,15 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
 
+  if (InPreloadSequence) {
+    // Alignment for first implicit arg is 4 from hidden_block_count_x.
+    const unsigned FirstImplicitArgAlignment = 4;
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, Align(FirstImplicitArgAlignment)) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset, Builder);
+  }
+
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5ccf21f76015de..2180caad6be3dd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2464,8 +2464,8 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
-// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
-// sequential starting from the first argument.
+// Allocate pre-loaded kernel arguments. Preloaded arguments must be
+// sequential and preloading must also start from the first argument.
 void SITargetLowering::allocatePreloadKernArgSGPRs(
     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
@@ -2476,6 +2476,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
   bool InPreloadSequence = true;
   unsigned InIdx = 0;
+  const Align KernelArgBaseAlign = Align(16);
   for (auto &Arg : F.args()) {
     if (!InPreloadSequence || !Arg.hasInRegAttr())
       break;
@@ -2492,7 +2493,6 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
          InIdx++) {
       assert(ArgLocs[ArgIdx].isMemLoc());
       auto &ArgLoc = ArgLocs[InIdx];
-      const Align KernelArgBaseAlign = Align(16);
       unsigned ArgOffset = ArgLoc.getLocMemOffset();
       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
       unsigned NumAllocSGPRs =
@@ -2531,6 +2531,97 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
       LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
     }
   }
+
+  if (Info.hasPreloadImplicitArgs()) {
+    unsigned ImplicitArgsBaseOffset = 0;
+    // Find the offset of the first implicit argument.
+    for (auto &Arg : MF.getFunction().args()) {
+      Type *Ty;
+      MaybeAlign Align;
+      if (Arg.hasByRefAttr()) {
+        Ty = Arg.getParamByRefType();
+        Align = Arg.getParamAlign();
+      } else {
+        Ty = Arg.getType();
+        Align = MF.getDataLayout().getABITypeAlign(Ty);
+      }
+      auto Size = MF.getDataLayout().getTypeAllocSize(Ty);
+      ImplicitArgsBaseOffset = alignTo(ImplicitArgsBaseOffset, *Align);
+      ImplicitArgsBaseOffset += Size;
+    }
+    ImplicitArgsBaseOffset = alignTo(
+        ImplicitArgsBaseOffset, Subtarget->getAlignmentForImplicitArgPtr());
+    unsigned ImplicitArgBaseSGPROffset = alignTo(ImplicitArgsBaseOffset, 4) / 4;
+    assert(ImplicitArgBaseSGPROffset <
+           AMDGPU::getMaxNumUserSGPRs(MF.getSubtarget()));
+
+    // Search for implicitarg preload intrinsics.
+    // FIXME: Should we use metadata to track the used implicitarg offsets?
+    SmallVector<unsigned, 4> ImplicitArgPreloadIntrins;
+    for (auto &BB : F) {
+      for (auto &Inst : BB) {
+        if (auto *CI = dyn_cast<CallInst>(&Inst)) {
+          if (CI->getIntrinsicID() == Intrinsic::amdgcn_preload_implicitarg) {
+            unsigned Offset =
+                cast<ConstantInt>(CI->getArgOperand(0))->getZExtValue();
+            ImplicitArgPreloadIntrins.push_back(Offset);
+          }
+        }
+      }
+    }
+
+    // Sort ImplicitArgPreloadIntrins by offset.
+    std::sort(ImplicitArgPreloadIntrins.begin(),
+              ImplicitArgPreloadIntrins.end());
+
+    // Allocate padding user SGPRs up to the beginning of the implicit
+    // arguments.
+    Info.allocateUserSGPRs(
+        ImplicitArgBaseSGPROffset +
+        (Info.getUserSGPRInfo().getNumUsedUserSGPRs() -
+         Info.getUserSGPRInfo().getNumKernargPreloadSGPRs()));
+
+    // Finally allocate SGPRs for preloaded implicit arguments.
+    unsigned AllocatedSGPRs = ImplicitArgBaseSGPROffset;
+    unsigned LastArgIdx = 0;
+    for (auto ImplicitArgOffset : ImplicitArgPreloadIntrins) {
+      // Offset from start of kernel args.
+      unsigned ArgOffset = ImplicitArgsBaseOffset + ImplicitArgOffset;
+      // The offset serves as a unique index to track the regsiter that the
+      // implicit argument is loaded to. Not expecting duplicate loads with the
+      // same offset.
+      unsigned ImplicitArgIdx = F.arg_size() + ImplicitArgOffset;
+      // FIXME: Is it valid to have multiple loads with the same offset
+      // relative to the implicitarg pointer?
+      assert(!Info.getArgInfo().PreloadKernArgs.count(ArgOffset));
+      Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+      unsigned Padding = alignTo(ArgOffset, 4) / 4 - AllocatedSGPRs;
+      if (Padding == 0 && Alignment < 4) {
+        // Argument is preloaded into the previous SGPR.
+        auto &KernargPreloadInfo =
+            Info.getArgInfo().PreloadKernArgs[ImplicitArgIdx];
+        KernargPreloadInfo.Regs.push_back(
+            Info.getArgInfo().PreloadKernArgs[LastArgIdx].Regs[0]);
+        KernargPreloadInfo.ByteOffset = 2;
+        continue;
+      }
+
+      if (Alignment < 4)
+        Padding -= 1;
+
+      // Byte offset for data in preload SGPRs.
+      unsigned ByteOffset = alignTo(ArgOffset, 4) - ArgOffset;
+      SmallVectorImpl<MCRegister> *PreloadRegs =
+          Info.addPreloadedKernArg(TRI, &AMDGPU::SReg_32RegClass, 1,
+                                   ImplicitArgIdx, Padding, ByteOffset);
+      Register Reg = (*PreloadRegs)[0];
+      assert(Reg);
+      MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+      CCInfo.AllocateReg(Reg);
+      AllocatedSGPRs += Padding + 1;
+      LastArgIdx = ImplicitArgIdx;
+    }
+  }
 }
 
 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
@@ -8445,6 +8536,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_addrspacecast_nonnull:
     return lowerADDRSPACECAST(Op, DAG);
+  case Intrinsic::amdgcn_preload_implicitarg: {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    assert(ST.hasKernargPreload());
+    SDLoc DL(Op);
+    unsigned ImplictArgIdx =
+        Op.getConstantOperandVal(1) + MF.getFunction().arg_size();
+
+    auto &ArgInfo =
+        MFI->getArgInfo().PreloadKernArgs.find(ImplictArgIdx)->getSecond();
+    Register Reg = ArgInfo.Regs[0];
+    unsigned ByteOffset = ArgInfo.ByteOffset;
+    Register VReg = MRI.getLiveInVirtReg(Reg);
+    SDValue Preload =
+        DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i32);
+    auto VT = Op.getSimpleValueType();
+    assert(VT == MVT::i32 || VT == MVT::i16);
+    if (VT == MVT::i16) {
+      if (ByteOffset == 0) {
+        Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Preload);
+      } else {
+        SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+        SDValue Extract =
+            DAG.getNode(ISD::SRL, DL, MVT::i32, Preload, ShiftAmt);
+
+        Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Extract);
+      }
+    }
+
+    return Preload;
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 2569f40fec0e48..ccfa97bebbc60a 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -42,7 +42,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
       WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
       PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
       WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
-      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+      PreloadImplicitArgs(false), GITPtrHigh(0xffffffff),
+      HighBitsOf32BitAddress(0) {
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
@@ -60,6 +61,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   if (IsKernel) {
     WorkGroupIDX = true;
     WorkItemIDX = true;
+    if (F.hasFnAttribute("amdgpu-preload-implicitargs"))
+      PreloadImplicitArgs = true;
+
   } else if (CC == CallingConv::AMDGPU_PS) {
     PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
   }
@@ -247,7 +251,8 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
 
 SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
     const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+    unsigned ByteOffset) {
   assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
          "Preload kernel argument allocated twice.");
   NumUserSGPRs += PaddingSGPRs;
@@ -256,6 +261,7 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   // merge them.
   Register PreloadReg =
       TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+  ArgInfo.PreloadKernArgs[KernArgIdx].ByteOffset = ByteOffset;
   if (PreloadReg &&
       (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
     ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
@@ -272,6 +278,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }
 
+bool SIMachineFunctionInfo::allocateUserSGPRs(
+    unsigned Number) {
+  if (Number <= getNumUserSGPRs())
+    return false;
+
+  NumUserSGPRs = Number;
+  return true;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7d0c1ba8448e6c..b1c9b1f493f3fc 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -461,6 +461,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // Flag is true if a kernel preloads any implicit arguments.
+  bool PreloadImplicitArgs : 1;
+
   bool MayNeedAGPRs : 1;
 
   // The hard-wired high half of the address of the global information table
@@ -743,8 +746,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   Register addLDSKernelId();
   SmallVectorImpl<MCRegister> *
   addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-                      unsigned AllocSizeDWord, int KernArgIdx,
-                      int PaddingSGPRs);
+                      unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+                      unsigned Offset = 0);
+
+  /// Reserve up to \p Number of user SGPRs.
+  bool allocateUserSGPRs(unsigned Number);
 
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
@@ -840,6 +846,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return ImplicitArgPtr;
   }
 
+  bool hasPreloadImplicitArgs() const {
+    return PreloadImplicitArgs;
+  }
+
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 72f10ea892e53f..b3a96552895122 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -7,6 +7,8 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 
+; TODO: Add tests for lowering of group_size intrinsics.
+
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll
new file mode 100644
index 00000000000000..3eb5aac70e7ba2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll
@@ -0,0 +1,2310 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD %s
+
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0xc
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s10
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-NEXT:    s_add_i32 s0, s6, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT:    s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-NEXT:    s_add_i32 s0, s10, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  %ext = zext i8 %val to i32
+  %add = add i32 %load, %ext
+  store i32 %add, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s2, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+  %load_x = load i32, ptr addrspace(4) %gep_x
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load_y = load i32, ptr addrspace(4) %gep_y
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load_z = load i32, ptr addrspace(4) %gep_z
+  %ins.0 =  insertelement <3 x i32> undef, i32 %load_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[0:1] offset:20
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:24
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[4:5] offset:20
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:24
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s1, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> undef, i32 %conv_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[0:1] offset:26
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:30
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:    s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s1, s9, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s4, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[4:5] offset:26
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:30
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:    s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s1, s13, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> undef, i32 %conv_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}



More information about the llvm-commits mailing list