[llvm] [AMDGPU][NFC] Fix preload-kernarg.ll test after attributor move (PR #98840)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 14 14:59:55 PDT 2024
https://github.com/kerbowa created https://github.com/llvm/llvm-project/pull/98840
The previous update was applied to a stale version of the test, which was missing functions and still had extra RUN lines that had since been removed.
>From 63fd18493fe09e31ed462098180db613f43e2a0f Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Sun, 14 Jul 2024 14:54:38 -0700
Subject: [PATCH] [AMDGPU][NFC] Fix preload-kernarg.ll test after attributor
move
The previous update was applied to a stale version of the test, which
was missing functions and still had extra RUN lines that had since been
removed.
---
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 3094 ++++++++++--------
1 file changed, 1721 insertions(+), 1373 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index a547c258e3921..f5c097f010eb8 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,18 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=2 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=8 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=2 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=8 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
-define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -23,51 +19,27 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -78,56 +50,32 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -138,51 +86,29 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -193,56 +119,34 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -253,51 +157,27 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -308,56 +188,32 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -367,47 +223,25 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -417,52 +251,30 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 {
-; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10
; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0
@@ -474,55 +286,29 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4
+; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s2, s0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0
@@ -534,60 +320,34 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s6, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -600,59 +360,33 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff
; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -665,56 +399,30 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16
+; GFX90a-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -723,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
ret void
}
-define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -734,47 +442,29 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -784,52 +474,34 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
-; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -843,63 +515,37 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
@@ -913,59 +559,33 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
+; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3]
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
+; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3]
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
; GFX90a-PRELOAD-8-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
}
-define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
@@ -995,83 +615,47 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_nop 1
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i32_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
+; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_nop 1
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_nop 1
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i32_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
+; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_nop 1
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1090,87 +674,51 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
+; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX90a-PRELOAD-2-NEXT: s_nop 0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
+; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX90a-PRELOAD-8-NEXT: s_nop 0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
@@ -1181,51 +729,29 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
@@ -1236,55 +762,33 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1296,55 +800,29 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1356,59 +834,33 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1420,55 +872,29 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1480,59 +906,33 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
@@ -1543,51 +943,43 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-2-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
@@ -1598,55 +990,47 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9
+; GFX90a-PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -1668,95 +1052,53 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_nop 1
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5f64_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
+; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_nop 1
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_nop 1
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5f64_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
+; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_nop 1
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -1778,99 +1120,57 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
+; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX90a-PRELOAD-2-NEXT: s_nop 0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
+; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX90a-PRELOAD-8-NEXT: s_nop 0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
@@ -1879,43 +1179,57 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
-; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
; GFX90a-NO-PRELOAD: ; %bb.0:
; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
@@ -1924,46 +1238,58 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) #0 {
+define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) {
; GFX940-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
@@ -1974,44 +1300,22 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
@@ -2024,50 +1328,28 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) #0 {
+define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) {
; GFX940-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
; GFX940-NO-PRELOAD: ; %bb.0:
; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
@@ -2078,44 +1360,22 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
-; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_endpgm
;
; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
@@ -2128,47 +1388,1135 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
-; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX90a-PRELOAD-8-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
-attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) {
+; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store half %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) {
+; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store bfloat %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store <2 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store <3 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store <6 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s10, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10
+; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s7
+; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s11
+; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s10, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[10:11] offset:12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store half %in, ptr addrspace(1) %out
+ store <7 x bfloat> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) {
+; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 1
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store i1 %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) {
+; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store fp128 %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-2-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-2-NEXT: global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-8-NEXT: global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store <7 x i8> %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v1, s[2:3] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-2-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v1, s[6:7] offset:12
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store <7 x half> %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Test when previous argument was not dword aligned.
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s7
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0xc
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0xc
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store i32 %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s7, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s7
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store <3 x i32> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store i16 %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT: ; %bb.0:
+; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT: ; %bb.0:
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT: global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s8
+; GFX90a-PRELOAD-8-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT: global_store_short v1, v0, s[10:11]
+; GFX90a-PRELOAD-8-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store <2 x i8> %in2, ptr addrspace(1) %out2
+ ret void
+}
More information about the llvm-commits
mailing list