[llvm] [AMDGPU] Enable kernel arg preloading with gfx90a (PR #81180)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 11 09:49:11 PST 2024


https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/81180

>From 86e67e2a81d423edf1e99af494519c74c0a94a17 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Thu, 8 Feb 2024 11:21:10 -0800
Subject: [PATCH] [AMDGPU] Enable kernel arg preloading with gfx90a

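Add a trap instruction to the beginning of the kernel prologue to handle
cases where preloading is attempted on HW loaded with incompatible
firmware (s_trap 2 on amdhsa, s_endpgm otherwise). With the trap in
place, remove the needsKernargPreloadBackwardsCompatibility() check,
which previously disabled preloading on targets without GFX940
instructions, so that gfx90a can preload kernel arguments.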
---
 llvm/docs/AMDGPUUsage.rst                     |     5 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |     3 +-
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     |     1 -
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |     6 -
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |    16 +-
 .../MCTargetDesc/AMDGPUTargetStreamer.h       |     9 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |     3 +-
 .../CodeGen/AMDGPU/preload-kernarg-header.ll  |     9 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  | 15978 +++++++++++-----
 9 files changed, 10685 insertions(+), 5345 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 6b2417143ca06c..bee237ad77691a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5366,7 +5366,10 @@ additional 256 bytes to the kernel_code_entry_byte_offset. This addition
 facilitates the incorporation of a prologue to the kernel entry to handle cases
 where code designed for kernarg preloading is executed on hardware equipped with
 incompatible firmware. If hardware has compatible firmware the 256 bytes at the
-start of the kernel entry will be skipped.
+start of the kernel entry will be skipped. Additionally, the compiler backend
+may insert a trap instruction at the start of the kernel prologue so that the
+kernel traps immediately when kernarg preloading is attempted on hardware with
+incompatible firmware.
 
 .. _amdgpu-amdhsa-kernel-prolog:
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index db81e1ee9e3899..886d855e227a2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -197,7 +197,8 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
 
   if (MFI.getNumKernargPreloadedSGPRs() > 0) {
     assert(AMDGPU::hasKernargPreload(STM));
-    getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
+    getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(),
+                                                  STM.isAmdHsaOS());
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 015c71080d6701..bc58407a73294c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -145,7 +145,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
 
     // Try to preload this argument into user SGPRs.
     if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
-        !ST.needsKernargPreloadBackwardsCompatibility() &&
         !Arg.getType()->isAggregateType())
       if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                            LastExplicitArgOffset))
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4f8eeaaf500b4d..ba633fa9e9cb4b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1254,12 +1254,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // \returns true if preloading kernel arguments is supported.
   bool hasKernargPreload() const { return KernargPreload; }
 
-  // \returns true if we need to generate backwards compatible code when
-  // preloading kernel arguments.
-  bool needsKernargPreloadBackwardsCompatibility() const {
-    return hasKernargPreload() && !hasGFX940Insts();
-  }
-
   // \returns true if the target has split barriers feature
   bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
 
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 5e9b1674d87dcb..61f4a94019efb0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -756,18 +756,26 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
 }
 
 bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
-    const MCSubtargetInfo &STI) {
-  for (int i = 0; i < 64; ++i) {
+    const MCSubtargetInfo &STI, bool TrapEnabled) {
+  const char *TrapInstr = TrapEnabled ? "\ts_trap 2" : "\ts_endpgm";
+  OS << TrapInstr
+     << " ; Trap with incompatible firmware that doesn't "
+        "support preloading kernel arguments.\n";
+  for (int i = 0; i < 63; ++i) {
     OS << "\ts_nop 0\n";
   }
   return true;
 }
 
 bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
-    const MCSubtargetInfo &STI) {
+    const MCSubtargetInfo &STI, bool TrapEnabled) {
   const uint32_t Encoded_s_nop = 0xbf800000;
+  const uint32_t Encoded_s_trap = 0xbf920002;
+  const uint32_t Encoded_s_endpgm = 0xbf810000;
+  const uint32_t TrapInstr = TrapEnabled ? Encoded_s_trap : Encoded_s_endpgm;
   MCStreamer &OS = getStreamer();
-  for (int i = 0; i < 64; ++i) {
+  OS.emitInt32(TrapInstr);
+  for (int i = 0; i < 63; ++i) {
     OS.emitInt32(Encoded_s_nop);
   }
   return true;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index ad5f27a33fcbd1..5aa80ff578c6b6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -89,7 +89,8 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
   virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
 
   /// \returns True on success, false on failure.
-  virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
+  virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
+                                        bool TrapEnabled) {
     return true;
   }
 
@@ -146,7 +147,8 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
   bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
 
   /// \returns True on success, false on failure.
-  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
+                                bool TrapEnabled) override;
 
   void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
@@ -200,7 +202,8 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
 
   /// \returns True on success, false on failure.
-  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+  bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI,
+                                bool TrapEnabled) override;
 
   void EmitAmdhsaKernelDescriptor(
       const MCSubtargetInfo &STI, StringRef KernelName,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a64a9e608f2173..83221f7ead37e1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2826,8 +2826,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (IsEntryFunc) {
     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
-    if (IsKernel && Subtarget->hasKernargPreload() &&
-        !Subtarget->needsKernargPreloadBackwardsCompatibility())
+    if (IsKernel && Subtarget->hasKernargPreload())
       allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
 
     allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 75feac35dacd84..a70488a00db739 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,8 +1,11 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA %s
 
 ; GCN: preload_kernarg_header
-; GCN-COUNT-64: s_nop 0
+; HSA: s_trap 2
+; NON-HSA: s_endpgm
+; GCN-COUNT-63: s_nop 0
 define amdgpu_kernel void @preload_kernarg_header(ptr %arg) {
     store ptr %arg, ptr %arg
     ret void
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 57980214e58e2b..d20c3a4007ffdd 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,1856 +1,3681 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-8 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
 
 define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i8:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i8:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i8:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xff
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i8:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i8:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xff
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i8:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i8:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i8:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i8:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_mov_b32 s0, 0xffff
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_mov_b32 s0, 0xffff
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xffff
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xffff
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i16 %arg0 to i32
   store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
-; NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store i32 %arg0, ptr addrspace(1) %out
   ret void
 }
 
-; Check alignment on the second preloaded arg.
 
 define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
-; NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
-; NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s4
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s3, s[0:1], 0x10
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    s_add_i32 s0, s2, s3
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x10
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT:    s_add_i32 s0, s2, s0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_add_i32 s0, s2, s6
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_add_i32 s0, s2, s6
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_add_i32 s0, s5, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s3, s[0:1], 0x10
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s2, s3
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    s_add_i32 s0, s2, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s2, s6
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_add_i32 s0, s2, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x10
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s2, s6, s2
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s0, s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    s_add_i32 s0, s6, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s6, s10
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_add_i32 s0, s6, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %add = add i32 %arg0, %arg1
   store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
-; NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
-; NO-PRELOAD-NEXT:    s_and_b32 s1, s4, 0xffff
-; NO-PRELOAD-NEXT:    s_add_i32 s0, s1, s0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-2-NEXT:    s_and_b32 s1, s4, 0xffff
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s0, 16
-; PRELOAD-2-NEXT:    s_add_i32 s0, s1, s0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
-; PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-8-NEXT:    s_and_b32 s1, s4, 0xffff
-; PRELOAD-8-NEXT:    s_add_i32 s0, s1, s0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT:    s_add_i32 s0, s1, s0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX940-PRELOAD-2-NEXT:    s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-8-NEXT:    s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT:    s_add_i32 s2, s2, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX90a-PRELOAD-2-NEXT:    s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-8-NEXT:    s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %ext = zext i16 %arg0 to i32
   %ext1 = zext i16 %arg1 to i32
   %add = add i32 %ext, %ext1
@@ -1859,3563 +3684,7068 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 }
 
 define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
-; NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
-; NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
-; PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; PRELOAD-4-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
-; PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-4-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-4-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <2 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
-; Don't try to preload byref args.
 
 define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
-; NO-PRELOAD-LABEL: byref_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
-; NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
-; NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
-; NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: byref_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: byref_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s1
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-2-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: byref_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: byref_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s1
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-8-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   %in = load i32, ptr addrspace(4) %in.byref
   store volatile i32 %in, ptr addrspace(1) %out, align 4
   store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; TODO: Should do partial preload in cases like these where only part of the arg
-; can be preloaded.
 
 define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
-; NO-PRELOAD-LABEL: v8i32_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
-; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; NO-PRELOAD-NEXT:    s_nop 1
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s7
-; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v8i32_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-1-NEXT:    s_nop 1
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v8i32_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-2-NEXT:    s_nop 1
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v8i32_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-4-NEXT:    s_nop 1
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v8i32_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-8-NEXT:    s_nop 1
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_nop 1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v8i32_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_nop 1
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v8i32_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_nop 1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v8i32_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_nop 1
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v8i32_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_nop 1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX90a-NO-PRELOAD-NEXT:    s_nop 0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <8 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
-; NO-PRELOAD-LABEL: v3i16_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
-; NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v3i16_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v3i16_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v3i16_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v3i16_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
-; PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-4-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <3 x i16> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
-; NO-PRELOAD-LABEL: v3i32_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v3i32_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v3i32_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v3i32_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v3i32_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <3 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
-; NO-PRELOAD-LABEL: v3f32_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v3f32_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v3f32_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
-; PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v3f32_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v3f32_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
-; PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <3 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
-; NO-PRELOAD-LABEL: v5i8_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
-; NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v5i8_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; PRELOAD-1-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
-; PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v5i8_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s5
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
-; PRELOAD-2-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v5i8_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s5
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; PRELOAD-4-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; PRELOAD-4-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v5i8_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s5
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
-; PRELOAD-8-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
+; GFX940-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
+; GFX90a-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <5 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
-; NO-PRELOAD-LABEL: v5f64_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
-; NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; NO-PRELOAD-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
-; NO-PRELOAD-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
-; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; NO-PRELOAD-NEXT:    s_nop 1
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s7
-; NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v5f64_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-1-NEXT:    s_nop 1
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v5f64_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-2-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-2-NEXT:    s_nop 1
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v5f64_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-4-NEXT:    s_nop 1
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v5f64_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
-; PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-8-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
-; PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
-; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; PRELOAD-8-NEXT:    s_nop 1
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
-; PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_nop 1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v5f64_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_nop 1
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v5f64_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_nop 1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v5f64_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_nop 1
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v5f64_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-PRELOAD-8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_nop 1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX90a-NO-PRELOAD-NEXT:    s_nop 0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s12
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <5 x double> %in, ptr addrspace(1) %out, align 8
   ret void
 }
 
 define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
-; NO-PRELOAD-LABEL: v8i8_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: v8i8_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: v8i8_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 24
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 16
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
-; PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: v8i8_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 8
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 24
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 16
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: v8i8_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 24
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 16
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
-; PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store <8 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) {
-; NO-PRELOAD-LABEL: i64_kernel_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store i64 %a, ptr addrspace(1) %out, align 8
   ret void
 }
 
 define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) {
-; NO-PRELOAD-LABEL: f64_kernel_preload_arg:
-; NO-PRELOAD:       ; %bb.0:
-; NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
-; NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
-; NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
-; NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; NO-PRELOAD-NEXT:    s_endpgm
-;
-; PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; PRELOAD-1:         s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:    s_nop 0
-; PRELOAD-1-NEXT:  ; %bb.0:
-; PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-1-NEXT:    s_endpgm
-;
-; PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; PRELOAD-2:         s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:    s_nop 0
-; PRELOAD-2-NEXT:  ; %bb.0:
-; PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-2-NEXT:    s_endpgm
-;
-; PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; PRELOAD-4:         s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:    s_nop 0
-; PRELOAD-4-NEXT:  ; %bb.0:
-; PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-4-NEXT:    s_endpgm
-;
-; PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; PRELOAD-8:         s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:    s_nop 0
-; PRELOAD-8-NEXT:  ; %bb.0:
-; PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
-; PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; PRELOAD-8-NEXT:    s_endpgm
+; GFX940-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:    s_nop 0
+; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:    s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:    s_nop 0
+; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:    s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-1:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:    s_nop 0
+; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-1-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:    s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-4:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:    s_nop 0
+; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-4-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:    s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
   store double %in, ptr addrspace(1) %out
   ret void
 }



More information about the llvm-commits mailing list