[llvm] [AMDGPU] Fix kernarg preloading crash with some types and alignments (PR #91625)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Sun May 19 22:00:24 PDT 2024


https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/91625

>From 5b715a6fa3e988a735c5a522f107cb6e8f301932 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Thu, 9 May 2024 02:16:53 -0700
Subject: [PATCH 1/2] [AMDGPU] Fix kernarg preloading crash with some types and
 alignments

Lowering of preloded arguments would fail with half/bfloat if they were
dword aligned in the kernarg segment and not part of a vector. Added
more tests with different alignments and types.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |   20 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 1920 +++++++++---------
 2 files changed, 920 insertions(+), 1020 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 89e83babcfef4..c7c4a8faa2fb0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2976,12 +2976,20 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    DL, Elts);
           }
 
-          SDValue CMemVT;
-          if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
-            CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
-          else
-            CMemVT = DAG.getBitcast(MemVT, NewArg);
-          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
+          // If the argument was preloaded to multiple consecutive 32-bit
+          // registers because of misalignment between addressable SGPR tuples
+          // and the argument size, we can still assume that because of kernarg
+          // segment alignment restrictions that NewArg's size is the same as
+          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
+          // truncate since we cannot preload to less than a single SGPR and the
+          // MemVT may be smaller.
+          EVT MemVTInt =
+              EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+          if (MemVT.bitsLT(NewArg.getSimpleValueType()))
+            NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
+
+          NewArg = DAG.getBitcast(MemVT, NewArg);
+          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                   Ins[i].Flags.isSExt(), &Ins[i]);
           NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
         }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index f0e709b5a1727..979bba938816c 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,18 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
 
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
 
-define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -23,19 +19,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8:
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -45,17 +29,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8:
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -65,7 +39,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -76,19 +50,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -98,17 +60,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -122,8 +74,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -134,19 +86,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -157,18 +97,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -179,7 +108,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -190,19 +119,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -213,18 +130,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -239,8 +145,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -251,19 +157,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -273,17 +167,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -293,7 +177,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -304,19 +188,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -326,17 +198,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -350,8 +212,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -361,18 +223,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -381,16 +232,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -399,7 +241,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -409,18 +251,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -429,16 +260,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -451,8 +273,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 }
 
 
-define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
-; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
@@ -464,20 +286,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s2, s3
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -489,17 +298,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -509,7 +308,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x0
@@ -521,20 +320,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s2, s6, s2
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -546,17 +332,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s6, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[8:9]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -570,8 +346,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -584,21 +360,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -612,19 +374,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -636,7 +386,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -649,21 +399,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -677,19 +413,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s1, s8, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -707,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
   ret void
 }
 
-define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -718,18 +442,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -740,18 +453,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -762,7 +464,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -772,18 +474,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -794,18 +485,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-PRELOAD-2-NEXT:    global_store_short v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v1, v0, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -820,8 +500,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 }
 
 
-define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
-; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -835,22 +515,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -865,22 +530,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -895,7 +545,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
@@ -909,22 +559,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -939,22 +574,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 }
 
 
-define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
@@ -995,27 +615,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i32_arg:
+; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1035,27 +635,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i32_arg:
+; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1075,7 +655,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1094,27 +674,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1134,27 +694,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1177,8 +717,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
   ret void
 }
 
-define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1189,20 +729,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1213,18 +740,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1235,7 +751,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1246,20 +762,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1270,18 +773,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1295,8 +787,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1308,20 +800,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1332,18 +811,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1354,7 +822,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1366,20 +834,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1390,18 +845,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1415,8 +859,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1428,20 +872,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1452,18 +883,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1474,7 +894,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1486,20 +906,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1510,18 +917,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1535,8 +931,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1547,20 +943,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1578,25 +961,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s5
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1614,7 +979,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1625,20 +990,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1656,25 +1008,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s9
-; GFX90a-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1695,8 +1029,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
   ret void
 }
 
-define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -1718,30 +1052,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5f64_arg:
+; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1764,30 +1075,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5f64_arg:
+; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1810,7 +1098,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -1832,30 +1120,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1878,30 +1143,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1927,8 +1169,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
   ret void
 }
 
-define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
-; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -1937,18 +1179,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1973,32 +1204,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    s_nop 0
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -2023,7 +1229,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -2032,18 +1238,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -2067,31 +1262,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -2129,17 +1300,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2149,15 +1309,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2177,17 +1328,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2197,15 +1337,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2229,17 +1360,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2249,15 +1369,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2277,17 +1388,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2297,15 +1397,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2317,3 +1408,804 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
   store double %in, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) {
+; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) {
+; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) {
+; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i1 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) {
+; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store fp128 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x i8> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[2:3] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[6:7] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x half> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Test when previous argument was not dword aligned.
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0xc
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i32 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s7, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <3 x i32> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i16 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v0, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <2 x i8> %in2, ptr addrspace(1) %out2
+  ret void
+}

>From ad232813ac5dcfa449894f5db92f2037086287b1 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Thu, 16 May 2024 13:03:19 -0700
Subject: [PATCH 2/2] Add more bfloat tests.

---
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 311 +++++++++++++++++++
 1 file changed, 311 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 979bba938816c..857bb897ead2a 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1529,6 +1529,317 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bflo
   ret void
 }
 
+define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <2 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <3 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <6 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s10, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s11
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s10, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[10:11] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  store <7 x bfloat> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
 define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) {
 ; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:



More information about the llvm-commits mailing list