[llvm] [AMDGPU] Add support for preloading implicit kernel arguments (PR #83817)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 31 23:40:52 PDT 2024


https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/83817

>From eaf39518d307cee96d160d1a3d5902db331f9f06 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Mon, 4 Mar 2024 01:08:55 -0800
Subject: [PATCH] [AMDGPU] Add support for preloading implicit kernel arguments

Implicit arguments may be preloaded into User SGPRs via the same
mechanism as explicit arguments if their offsets within the kernarg
segment fall within the range of available registers. Lowering of these
implicit arguments may happen early so the implementation here follows
the same concept and is mostly agnostic to which values are being
loaded, and instead only cares about offsets from the implicitarg
pointer and the size of the values being used. Unlike preloading of
explicit arguments there are not restrictions on exactly which implicit
arguments are used and whether there is a unbroken sequence of used
arguments, but instead this will attempt to preload anything that falls
within the range of available User SGPRs on the target HW.

A limitation of this patch is that it only supports i16/i32 arguments,
but like other details of preloading kernargs for both implicit and
explicit arguments this is likely to be expanded and changed in the near
future.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   8 +
 .../Target/AMDGPU/AMDGPUArgumentUsageInfo.h   |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |   1 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  30 +
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |   3 +
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     |  92 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 132 +++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   5 +
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  19 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  14 +-
 .../GlobalISel/preload-implict-kernargs.ll    | 748 ++++++++++++++++++
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   2 +
 .../AMDGPU/preload-implict-kernargs.ll        | 698 ++++++++++++++++
 13 files changed, 1747 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/preload-implict-kernargs.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bda3b066b77636..34a11d23cfdced 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3210,4 +3210,12 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
   [llvm_anyptr_ty], [llvm_anyptr_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+// This intrinsic is used to track the user SGPRs that hold implicit kernel
+// arguments. The i32 argument for this intrinsic represents the offset in
+// bytes from the value pointed to by the implicitarg pointer.
+def int_amdgcn_preload_implicitarg :
+  DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_i32_ty],
+  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<0>>]
+>;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 42b33c50d9f8c4..2434c7a1f0ee2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -76,6 +76,10 @@ struct ArgDescriptor {
     return StackOffset;
   }
 
+  void setMask(unsigned Mask) {
+    this->Mask = Mask;
+  }
+
   unsigned getMask() const {
     return Mask;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7e1f041fa10933..75e577658ef461 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -522,6 +522,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
 
   allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
+  TLI.allocatePreloadImplicitKernArgSGPRs(CCInfo, MF, *TRI, *Info);
 
   unsigned i = 0;
   const Align KernArgBaseAlign(16);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e55d1de01b4fd1..425e4148739534 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5495,6 +5495,34 @@ bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizePreloadImplicitarg(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  assert(ST.hasKernargPreload());
+  MachineFunction &MF = B.getMF();
+  Register OrigReg = MI.getOperand(0).getReg();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // A unique identifier defined as the offset from start of implicit args added
+  // to the number of formal args.
+  unsigned ImplictArgIdx =
+      MI.getOperand(2).getImm() + MF.getFunction().arg_size();
+  auto &ArgDesc =
+      MFI->getArgInfo().PreloadKernArgs.find(ImplictArgIdx)->getSecond();
+  assert(ArgDesc.Regs.size() == 1);
+  Register Reg = ArgDesc.Regs[0];
+  ArgDescriptor Arg = ArgDescriptor::createRegister(Reg, ArgDesc.getMask());
+  Register Dst = MRI.createGenericVirtualRegister(LLT::scalar(32));
+  loadInputValue(Dst, B, &Arg, &AMDGPU::SReg_32RegClass, LLT::scalar(32));
+  if (MRI.getType(OrigReg) != LLT::scalar(32)) {
+    assert(MRI.getType(OrigReg) == LLT::scalar(16));
+    B.buildTrunc(OrigReg, Dst);
+  } else {
+    B.buildCopy(OrigReg, Dst);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                               MachineRegisterInfo &MRI,
                                               MachineIRBuilder &B,
@@ -7128,6 +7156,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_lds_kernel_id:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
+  case Intrinsic::amdgcn_preload_implicitarg:
+    return legalizePreloadImplicitarg(MI, MRI, B);
   case Intrinsic::amdgcn_dispatch_ptr:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8a..44e0c2946ba323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -188,6 +188,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B) const;
 
+  bool legalizePreloadImplicitarg(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                  MachineIRBuilder &B) const;
+
   bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B, unsigned AddrSpace) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index bc58407a73294c..ec36dd257b25e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -13,8 +13,10 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetMachine.h"
@@ -32,8 +34,6 @@ class PreloadKernelArgInfo {
   unsigned NumFreeUserSGPRs;
 
 public:
-  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
-
   PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
     setInitialFreeUserSGPRsCount();
   }
@@ -64,6 +64,86 @@ class PreloadKernelArgInfo {
     NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
     return true;
   }
+
+  // Try to allocate SGPRs to preload implicit kernel arguments.
+  void tryAllocImplicitArgPreloadSGPRs(unsigned ImplicitArgsBaseOffset,
+                                       IRBuilder<> &Builder) {
+    IntrinsicInst *ImplicitArgPtr = nullptr;
+    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+        if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(I))
+          if (CI->getIntrinsicID() == Intrinsic::amdgcn_implicitarg_ptr) {
+            ImplicitArgPtr = CI;
+            break;
+          }
+      }
+    }
+
+    if (!ImplicitArgPtr)
+      return;
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    // Pair is the load and the load offset.
+    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
+    for (auto *U : ImplicitArgPtr->users()) {
+      if (!U->hasOneUse())
+        continue;
+
+      int64_t Offset = 0;
+      auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
+      if (!Load) {
+        if (GetPointerBaseWithConstantOffset(U, Offset, DL) != ImplicitArgPtr)
+          continue;
+        Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      }
+
+      if (!Load || !Load->isSimple())
+        continue;
+
+      // FIXME: Expand to handle 64-bit implicit args and large merged loads.
+      if (Load->getType() != Builder.getInt16Ty() &&
+          Load->getType() != Builder.getInt32Ty())
+        continue;
+
+      ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
+    }
+
+    if (ImplicitArgLoads.empty())
+      return;
+
+    // Allocate loads in order of offset. We need to be sure that the implicit
+    // argument can actually be preloaded.
+    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
+              [](const std::pair<LoadInst *, unsigned> &A,
+                 const std::pair<LoadInst *, unsigned> &B) {
+                return A.second < B.second;
+              });
+
+    unsigned LastExplicitArgOffset = ImplicitArgsBaseOffset;
+    bool HasPreloadImplicitArgs = false;
+    for (const auto &Load : ImplicitArgLoads) {
+      LoadInst *LoadInst = Load.first;
+      Type *LoadType = LoadInst->getType();
+      auto LoadOffset = Load.second;
+      unsigned LoadSize = DL.getTypeStoreSize(LoadType);
+      // If we fail to preload any implicit argument we know we don't have SGPRs
+      // to preload any subsequent ones with larger offsets.
+      if (!tryAllocPreloadSGPRs(LoadSize, LoadOffset + ImplicitArgsBaseOffset,
+                                LastExplicitArgOffset))
+        break;
+
+      HasPreloadImplicitArgs = true;
+      LastExplicitArgOffset = LoadOffset + LoadSize;
+      llvm::Value *LoadOffsetValue =
+          llvm::ConstantInt::get(Builder.getInt32Ty(), LoadOffset);
+      CallInst *PreloadIntrin = Builder.CreateIntrinsic(
+          Intrinsic::amdgcn_preload_implicitarg, {LoadType}, {LoadOffsetValue});
+      LoadInst->replaceAllUsesWith(PreloadIntrin);
+    }
+
+    if (HasPreloadImplicitArgs)
+      F.addFnAttr("amdgpu-preload-implicitargs");
+  }
 };
 
 class AMDGPULowerKernelArguments : public FunctionPass {
@@ -282,6 +362,14 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   KernArgSegment->addRetAttr(
       Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
 
+  if (InPreloadSequence) {
+    uint64_t ImplicitArgsBaseOffset =
+        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
+        BaseOffset;
+    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
+                                                Builder);
+  }
+
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0a4370de0613b3..c8d39b7bfaac65 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2408,6 +2408,97 @@ void SITargetLowering::allocateSpecialInputSGPRs(
     allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
 }
 
+void SITargetLowering::allocatePreloadImplicitKernArgSGPRs(
+    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
+    SIMachineFunctionInfo &Info) const {
+  if (!Info.hasPreloadImplicitArgs())
+    return;
+
+  Function &F = MF.getFunction();
+  const Align KernelArgBaseAlign = Align(16);
+  unsigned ImplicitArgsBaseOffset =
+      getImplicitParameterOffset(MF, FIRST_IMPLICIT);
+  unsigned ImplicitArgBaseSGPROffset = alignTo(ImplicitArgsBaseOffset, 4) / 4;
+  assert(ImplicitArgBaseSGPROffset <
+         AMDGPU::getMaxNumUserSGPRs(MF.getSubtarget()));
+
+  // Search for implicitarg preload intrinsics.
+  // FIXME: Should we use metadata to track the used implicitarg offsets?
+  SmallVector<std::pair<Type *, unsigned>, 4> ImplicitArgPreloadIntrins;
+  for (auto &BB : F) {
+    for (auto &Inst : BB) {
+      if (auto *CI = dyn_cast<CallInst>(&Inst)) {
+        if (CI->getIntrinsicID() == Intrinsic::amdgcn_preload_implicitarg) {
+          unsigned Offset =
+              cast<ConstantInt>(CI->getArgOperand(0))->getZExtValue();
+          ImplicitArgPreloadIntrins.push_back(
+              std::make_pair(CI->getType(), Offset));
+        }
+      }
+    }
+  }
+
+  // Sort ImplicitArgPreloadIntrins by offset.
+  std::sort(
+      ImplicitArgPreloadIntrins.begin(), ImplicitArgPreloadIntrins.end(),
+      [](const std::pair<Type *, unsigned> &A,
+         const std::pair<Type *, unsigned> &B) { return A.second < B.second; });
+
+  // Allocate padding user SGPRs up to the beginning of the implicit
+  // arguments.
+  Info.allocateUserSGPRs(ImplicitArgBaseSGPROffset +
+                         (Info.getUserSGPRInfo().getNumUsedUserSGPRs() -
+                          Info.getUserSGPRInfo().getNumKernargPreloadSGPRs()));
+
+  // Finally allocate SGPRs for preloaded implicit arguments.
+  unsigned AllocatedSGPRs = ImplicitArgBaseSGPROffset;
+  unsigned LastArgIdx = 0;
+  for (auto &ImplicitArg : ImplicitArgPreloadIntrins) {
+    unsigned ImplicitArgOffset = ImplicitArg.second;
+    Type *Ty = ImplicitArg.first;
+    // Offset from start of kernel args.
+    unsigned ArgOffset = ImplicitArgsBaseOffset + ImplicitArgOffset;
+    // The offset serves as a unique index to track the regsiter that the
+    // implicit argument is loaded to. Not expecting duplicate loads with the
+    // same offset.
+    unsigned ImplicitArgIdx = F.arg_size() + ImplicitArgOffset;
+    // FIXME: Is it valid to have multiple loads with the same offset
+    // relative to the implicitarg pointer?
+    assert(!Info.getArgInfo().PreloadKernArgs.count(ArgOffset));
+    Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+    unsigned Padding = alignTo(ArgOffset, 4) / 4 - AllocatedSGPRs;
+    if (Padding == 0 && Alignment < 4) {
+      // Argument is preloaded into the previous SGPR.
+      auto &KernargPreloadInfo =
+          Info.getArgInfo().PreloadKernArgs[ImplicitArgIdx];
+      KernargPreloadInfo.Regs.push_back(
+          Info.getArgInfo().PreloadKernArgs[LastArgIdx].Regs[0]);
+      KernargPreloadInfo.setMask(0xffff0000u);
+      continue;
+    }
+
+    // Handle the case where the argument in not in the previously allocated
+    // SGPR but the alignment is < 4. This can happen, for example, if the
+    // first used implicit argument is not dword aligned.
+    if (Alignment < 4)
+      Padding -= 1;
+
+    // Byte offset for data in preload SGPRs.
+    unsigned ByteOffset = alignTo(ArgOffset, 4) - ArgOffset;
+    unsigned Mask = Ty == Type::getInt16Ty(F.getContext()) ? 0xffffu : ~0u;
+    Mask <<= ByteOffset * 8;
+    SmallVectorImpl<MCRegister> *PreloadRegs = Info.addPreloadedKernArg(
+        TRI, &AMDGPU::SReg_32RegClass, 1, ImplicitArgIdx, Padding, Mask);
+    Register Reg = (*PreloadRegs)[0];
+    assert(Reg);
+    const LLT S32 = LLT::scalar(32);
+    MF.getRegInfo().setType(MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass), S32);
+    CCInfo.AllocateReg(Reg);
+    AllocatedSGPRs += Padding + 1;
+    LastArgIdx = ImplicitArgIdx;
+  }
+}
+
 // Allocate special inputs passed in user SGPRs.
 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                             MachineFunction &MF,
@@ -2466,8 +2557,8 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
   // these from the dispatch pointer.
 }
 
-// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
-// sequential starting from the first argument.
+// Allocate pre-loaded kernel arguments. Preloaded arguments must be
+// sequential and preloading must also start from the first argument.
 void SITargetLowering::allocatePreloadKernArgSGPRs(
     CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
     const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
@@ -2478,6 +2569,7 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
   GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
   bool InPreloadSequence = true;
   unsigned InIdx = 0;
+  const Align KernelArgBaseAlign = Align(16);
   for (auto &Arg : F.args()) {
     if (!InPreloadSequence || !Arg.hasInRegAttr())
       break;
@@ -2494,7 +2586,6 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
          InIdx++) {
       assert(ArgLocs[ArgIdx].isMemLoc());
       auto &ArgLoc = ArgLocs[InIdx];
-      const Align KernelArgBaseAlign = Align(16);
       unsigned ArgOffset = ArgLoc.getLocMemOffset();
       Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
       unsigned NumAllocSGPRs =
@@ -2533,6 +2624,8 @@ void SITargetLowering::allocatePreloadKernArgSGPRs(
       LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
     }
   }
+
+  allocatePreloadImplicitKernArgSGPRs(CCInfo, MF, TRI, Info);
 }
 
 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
@@ -8452,6 +8545,39 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_addrspacecast_nonnull:
     return lowerADDRSPACECAST(Op, DAG);
+  case Intrinsic::amdgcn_preload_implicitarg: {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    assert(ST.hasKernargPreload());
+    SDLoc DL(Op);
+    unsigned ImplictArgIdx =
+        Op.getConstantOperandVal(1) + MF.getFunction().arg_size();
+
+    auto &ArgInfo =
+        MFI->getArgInfo().PreloadKernArgs.find(ImplictArgIdx)->getSecond();
+    Register Reg = ArgInfo.Regs[0];
+    Register VReg = MRI.getLiveInVirtReg(Reg);
+    SDValue Preload =
+        DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i32);
+    auto VT = Op.getSimpleValueType();
+    assert(VT == MVT::i32 || VT == MVT::i16);
+    if (VT == MVT::i16) {
+      assert(ArgInfo.isMasked());
+      unsigned Mask = ArgInfo.getMask();
+      if (Mask == 0xffff0000u) {
+        SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
+        SDValue Extract =
+            DAG.getNode(ISD::SRL, DL, MVT::i32, Preload, ShiftAmt);
+
+        Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Extract);
+      } else {
+        assert(Mask == 0xffffu);
+        Preload = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Preload);
+      }
+    }
+
+    return Preload;
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 9856a2923d38f7..c218999c9e40b4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -585,6 +585,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
     const SIRegisterInfo &TRI,
     SIMachineFunctionInfo &Info) const;
 
+  void allocatePreloadImplicitKernArgSGPRs(CCState &CCInfo,
+                                        MachineFunction &MF,
+                                        const SIRegisterInfo &TRI,
+                                        SIMachineFunctionInfo &Info) const;
+
   void allocateSpecialInputVGPRs(CCState &CCInfo,
                                  MachineFunction &MF,
                                  const SIRegisterInfo &TRI,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 12433dc83c4892..53387b4bb27362 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -42,7 +42,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
       WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
       PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
       WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
-      GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
+      PreloadImplicitArgs(false), GITPtrHigh(0xffffffff),
+      HighBitsOf32BitAddress(0) {
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
@@ -60,6 +61,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   if (IsKernel) {
     WorkGroupIDX = true;
     WorkItemIDX = true;
+    if (F.hasFnAttribute("amdgpu-preload-implicitargs"))
+      PreloadImplicitArgs = true;
+
   } else if (CC == CallingConv::AMDGPU_PS) {
     PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
   }
@@ -247,7 +251,8 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
 
 SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
     const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+    unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+    unsigned Mask) {
   assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
          "Preload kernel argument allocated twice.");
   NumUserSGPRs += PaddingSGPRs;
@@ -256,6 +261,7 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   // merge them.
   Register PreloadReg =
       TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+  ArgInfo.PreloadKernArgs[KernArgIdx].setMask(Mask);
   if (PreloadReg &&
       (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
     ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
@@ -272,6 +278,15 @@ SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
   return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
 }
 
+bool SIMachineFunctionInfo::allocateUserSGPRs(
+    unsigned Number) {
+  if (Number <= getNumUserSGPRs())
+    return false;
+
+  NumUserSGPRs = Number;
+  return true;
+}
+
 void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
                                              uint64_t Size, Align Alignment) {
   // Skip if it is an entry function or the register is already added.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 7d0c1ba8448e6c..bb5335c76593c8 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -461,6 +461,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // Flag is true if a kernel preloads any implicit arguments.
+  bool PreloadImplicitArgs : 1;
+
   bool MayNeedAGPRs : 1;
 
   // The hard-wired high half of the address of the global information table
@@ -743,8 +746,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   Register addLDSKernelId();
   SmallVectorImpl<MCRegister> *
   addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
-                      unsigned AllocSizeDWord, int KernArgIdx,
-                      int PaddingSGPRs);
+                      unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs,
+                      unsigned Mask = ~0u);
+
+  /// Reserve up to \p Number of user SGPRs.
+  bool allocateUserSGPRs(unsigned Number);
 
   /// Increment user SGPRs used for padding the argument list only.
   Register addReservedUserSGPR() {
@@ -840,6 +846,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return ImplicitArgPtr;
   }
 
+  bool hasPreloadImplicitArgs() const {
+    return PreloadImplicitArgs;
+  }
+
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/preload-implict-kernargs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/preload-implict-kernargs.ll
new file mode 100644
index 00000000000000..d76b3747d257b1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/preload-implict-kernargs.ll
@@ -0,0 +1,748 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -global-isel -amdgpu-kernarg-preload-count=16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -global-isel -amdgpu-kernarg-preload-count=16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD %s
+
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0xc
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s5
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s9
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-NEXT:    s_add_i32 s0, s6, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT:    s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-PRELOAD-NEXT:    s_add_i32 s2, s10, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  %ext = zext i8 %val to i32
+  %add = add i32 %load, %ext
+  store i32 %add, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s2, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_mov_b32 s0, s6
+; GFX940-NO-PRELOAD-NEXT:    s_mov_b32 s1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_mov_b32 s4, s2
+; GFX90a-NO-PRELOAD-NEXT:    s_mov_b32 s5, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+  %load_x = load i32, ptr addrspace(4) %gep_x
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load_y = load i32, ptr addrspace(4) %gep_y
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load_z = load i32, ptr addrspace(4) %gep_z
+  %ins.0 =  insertelement <3 x i32> undef, i32 %load_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_and_b32 s2, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s2, s11, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_and_b32 s2, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v0, v3, s[0:1] offset:20
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v1, v3, s[0:1] offset:22
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:24
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s2, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v0, v3, s[4:5] offset:20
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v1, v3, s[4:5] offset:22
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:24
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s1, s11, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> undef, i32 %conv_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s2, s8, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s2, s12, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_and_b32 s2, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s2, s9, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s2, s13, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v0, v3, s[0:1] offset:26
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v1, v3, s[0:1] offset:28
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:30
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s1, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s2, s9, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v0, v3, s[4:5] offset:26
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v1, v3, s[4:5] offset:28
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:30
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s1, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s2, s13, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> undef, i32 %conv_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 72f10ea892e53f..b3a96552895122 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -7,6 +7,8 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 
+; TODO: Add tests for lowering of group_size intrinsics.
+
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll
new file mode 100644
index 00000000000000..e74b7ae07f7e26
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preload-implict-kernargs.ll
@@ -0,0 +1,698 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD %s
+
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0xc
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s10
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load = load i32, ptr addrspace(4) %gep
+  store i32 %load, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-NEXT:    s_add_i32 s0, s6, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT:    s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-NEXT:    s_add_i32 s0, s10, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load = load i32, ptr addrspace(4) %imp_arg_ptr
+  %ext = zext i8 %val to i32
+  %add = add i32 %load, %ext
+  store i32 %add, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s2, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+  %load_x = load i32, ptr addrspace(4) %gep_x
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+  %load_y = load i32, ptr addrspace(4) %gep_y
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+  %load_z = load i32, ptr addrspace(4) %gep_z
+  %ins.0 =  insertelement <3 x i32> undef, i32 %load_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[0:1] offset:20
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:24
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s1, s7, 0xffff
+; GFX940-PRELOAD-NEXT:    s_and_b32 s4, s8, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[4:5] offset:20
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:24
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s1, s11, 0xffff
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s12, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> undef, i32 %conv_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s0, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s13, 16
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+  %load = load i16, ptr addrspace(4) %gep
+  %conv = zext i16 %load to i32
+  store i32 %conv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[0:1] offset:26
+; GFX940-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[0:1] offset:30
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT:    s_lshr_b32 s1, s9, 16
+; GFX940-PRELOAD-NEXT:    s_and_b32 s4, s9, 0xffff
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    global_load_dword v0, v3, s[4:5] offset:26
+; GFX90a-NO-PRELOAD-NEXT:    global_load_ushort v2, v3, s[4:5] offset:30
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-PRELOAD:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT:    s_lshr_b32 s1, s13, 16
+; GFX90a-PRELOAD-NEXT:    s_and_b32 s2, s13, 0xffff
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT:    s_endpgm
+  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+  %load_x = load i16, ptr addrspace(4) %gep_x
+  %conv_x = zext i16 %load_x to i32
+  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+  %load_y = load i16, ptr addrspace(4) %gep_y
+  %conv_y = zext i16 %load_y to i32
+  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+  %load_z = load i16, ptr addrspace(4) %gep_z
+  %conv_z = zext i16 %load_z to i32
+  %ins.0 =  insertelement <3 x i32> undef, i32 %conv_x, i32 0
+  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+  store <3 x i32> %ins.2, ptr addrspace(1) %out
+  ret void
+}



More information about the llvm-commits mailing list