[llvm] 7d5281a - [AMDGPU][NFC] Fix preload-kernarg.ll test after attributor move (#98840)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 18 17:04:31 PDT 2024
Author: Austin Kerbow
Date: 2024-08-18T17:04:27-07:00
New Revision: 7d5281a66d5d42c65cfb9d95eaf9aa01afb089fb
URL: https://github.com/llvm/llvm-project/commit/7d5281a66d5d42c65cfb9d95eaf9aa01afb089fb
DIFF: https://github.com/llvm/llvm-project/commit/7d5281a66d5d42c65cfb9d95eaf9aa01afb089fb.diff
LOG: [AMDGPU][NFC] Fix preload-kernarg.ll test after attributor move (#98840)
The previous update was applied to a stale version of the test, which was
missing functions and still contained extra RUN lines that had since been removed.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index a547c258e3921d..b7113a65607fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,721 +1,157 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s
-define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
+; GFX940-LABEL: ptr1_i8:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_i8:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 {
+; GFX940-LABEL: ptr1_i8_zext_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_i8_zext_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_mov_b32 s0, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 {
+; GFX940-LABEL: ptr1_i16_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_i16_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 {
+; GFX940-LABEL: ptr1_i32_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_i32_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) #0 {
-; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: s_add_i32 s0, s5, s4
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT: s_load_dword s5, s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s5, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s3, s2
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 {
+; GFX940-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_add_i32 s0, s2, s6
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-NO-PRELOAD-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-NO-PRELOAD-NEXT: s_add_i32 s0, s1, s0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-1-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-1-NEXT: s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-2-NEXT: s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-NO-PRELOAD-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-1-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-1-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-2-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-2-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-4-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-4-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s3, s2, 16
-; GFX90a-PRELOAD-8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX90a-PRELOAD-8-NEXT: s_add_i32 s2, s2, s3
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 {
+; GFX940-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX940-NEXT: s_add_i32 s0, s1, s0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_add_i32 s0, s1, s0
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
@@ -723,1452 +159,878 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
ret void
}
-define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dword s4, s[0:1], 0x8
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 {
+; GFX940-LABEL: ptr1_v2i8_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NEXT: global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-NEXT: global_store_short v1, v0, s[6:7]
+; GFX90a-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
-; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x100
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s2
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s3
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3]
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
+; GFX940-LABEL: byref_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: byref_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
ret void
}
+; The second argument is not expected to be preloaded with the current behavior.
-define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_nop 1
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_nop 1
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i32_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_nop 1
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_nop 1
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i32_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_nop 1
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-NO-PRELOAD-NEXT: s_nop 0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 inreg %after.offset) #0 {
+; GFX940-LABEL: byref_staggered_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: byref_staggered_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: v_mov_b32_e32 v2, s1
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: s_waitcnt vmcnt(0)
+; GFX90a-NEXT: s_endpgm
+ %in = load i32, ptr addrspace(4) %in.byref
+ store volatile i32 %in, ptr addrspace(1) %out, align 4
+ store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+
+define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 {
+; GFX940-LABEL: v8i32_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v8i32_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-NEXT: s_nop 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_short v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 {
+; GFX940-LABEL: v3i16_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v3i16_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 {
+; GFX940-LABEL: v3i32_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v3i32_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 {
+; GFX940-LABEL: v3f32_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v3f32_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-1-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-2-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-2-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-4-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-4-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-8-NEXT: global_store_byte v0, v1, s[0:1] offset:4
-; GFX90a-PRELOAD-8-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 {
+; GFX940-LABEL: v5i8_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NEXT: global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v5i8_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-NEXT: v_mov_b32_e32 v2, s9
+; GFX90a-NEXT: global_store_byte v1, v2, s[6:7] offset:4
+; GFX90a-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX90a-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_nop 1
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_nop 1
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5f64_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_nop 1
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_nop 1
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5f64_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x60
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[12:13] offset:32 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_nop 1
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-NO-PRELOAD-NEXT: s_nop 0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 {
+; GFX940-LABEL: v5f64_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
+; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; GFX940-NEXT: v_mov_b32_e32 v0, s8
+; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v1, s9
+; GFX940-NEXT: v_mov_b32_e32 v2, s10
+; GFX940-NEXT: v_mov_b32_e32 v3, s11
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v5f64_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX90a-NEXT: s_nop 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NEXT: v_mov_b32_e32 v3, s11
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 {
+; GFX940-LABEL: v8i8_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s5, 24
+; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s5, 16
+; GFX940-NEXT: v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v8i8_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s9, 24
+; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s9, 16
+; GFX90a-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) #0 {
-; GFX940-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: i64_kernel_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 {
+; GFX940-LABEL: i64_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i64_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
-define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) #0 {
-; GFX940-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
-; GFX940-NO-PRELOAD: ; %bb.0:
-; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1: ; %bb.0:
-; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-2: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4: ; %bb.0:
-; GFX940-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-8: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX940-PRELOAD-8-NEXT: s_endpgm
-;
-; GFX90a-NO-PRELOAD-LABEL: f64_kernel_preload_arg:
-; GFX90a-NO-PRELOAD: ; %bb.0:
-; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-NO-PRELOAD-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1: ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-1-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-2: ; %bb.0:
-; GFX90a-PRELOAD-2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-2-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4: ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-4-NEXT: s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-8: ; %bb.0:
-; GFX90a-PRELOAD-8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s2
-; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s3
-; GFX90a-PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX90a-PRELOAD-8-NEXT: s_endpgm
+define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 {
+; GFX940-LABEL: f64_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: f64_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
-attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 {
+; GFX940-LABEL: half_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: half_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store half %in, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 {
+; GFX940-LABEL: bfloat_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: bfloat_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store bfloat %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores a <2 x bfloat> kernel argument to %out. The checks expect the whole
+; vector in a single SGPR (s4 on GFX940, s8 on GFX90a) and one
+; global_store_dword, with no kernarg s_load.
+define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 {
+; GFX940-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store <2 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores a <3 x bfloat> kernel argument to %out. The checks expect the vector
+; in two consecutive SGPRs (s[4:5] on GFX940, s[8:9] on GFX90a) and the store
+; split in two: a short at offset 4 from the second SGPR, then a dword at
+; offset 0 from the first.
+define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 {
+; GFX940-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store <3 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores a <6 x bfloat> kernel argument to %out. The checks expect the vector
+; in s[6:8] (GFX940) / s[10:12] (GFX90a) and one global_store_dwordx3. The two
+; SGPRs directly after %out (s[4:5] / s[8:9]) are not read; presumably they are
+; alignment padding for the vector - TODO confirm.
+define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 {
+; GFX940-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store <6 x bfloat> %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores a half to %out and a <7 x bfloat> to %out2; all four arguments are
+; marked inreg. On GFX940 the checks expect everything preloaded (%in in s4,
+; %in2 in s[6:9], %out2 in s[10:11]). On GFX90a %in and %in2 are read from
+; s8 and s[10:13], but %out2 is fetched with s_load_dwordx2 from kernarg offset
+; 0x20; presumably the preload SGPRs run out there - TODO confirm.
+define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
+; GFX940-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, s9
+; GFX940-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: s_endpgm
+ store half %in, ptr addrspace(1) %out
+ store <7 x bfloat> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+; Stores an i1 kernel argument to %out. The checks expect the SGPR holding %in
+; (s4 on GFX940, s8 on GFX90a) to be masked with `s_and_b32 ..., 1` before the
+; value is written with global_store_byte.
+define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 {
+; GFX940-LABEL: i1_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s4, 1
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i1_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store i1 %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores an fp128 kernel argument to %out. The checks expect the 128-bit value
+; in four SGPRs (s[6:9] on GFX940, s[10:13] on GFX90a) and a single
+; global_store_dwordx4, with no kernarg s_load.
+define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 {
+; GFX940-LABEL: fp128_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: v_mov_b32_e32 v3, s9
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: fp128_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v4, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v3, s13
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store fp128 %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores a <7 x i8> kernel argument to %out. The checks expect the vector in
+; two SGPRs (s[4:5] on GFX940, s[8:9] on GFX90a), its bytes repacked with
+; shift and SDWA-or instructions, and three stores: a byte at offset 6
+; (global_store_byte_d16_hi), a short at offset 4 and a dword at offset 0.
+define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 {
+; GFX940-LABEL: v7i8_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s4, 8
+; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: s_lshr_b32 s0, s5, 8
+; GFX940-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v3, s5
+; GFX940-NEXT: v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-NEXT: global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v7i8_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 8
+; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: s_lshr_b32 s0, s9, 8
+; GFX90a-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-NEXT: v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NEXT: v_mov_b32_e32 v3, s9
+; GFX90a-NEXT: v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-NEXT: global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store <7 x i8> %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores a <7 x half> kernel argument to %out. The checks expect the vector in
+; four SGPRs (s[6:9] on GFX940, s[10:13] on GFX90a) and the store split in two:
+; a short at offset 12 from the last SGPR, then a dwordx3 at offset 0 from the
+; first three.
+define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 {
+; GFX940-LABEL: v7half_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s9
+; GFX940-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: v7half_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s13
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store <7 x half> %in, ptr addrspace(1) %out
+ ret void
+}
+
+; Unlike the neighbouring kernels, the first argument (%out) is NOT marked
+; inreg here. The checks contain no preload header (no s_trap / .fill) and
+; expect every argument, including the inreg ones, to be read from the kernarg
+; segment with s_load (offsets 0x0 and 0x10).
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
+; GFX940-LABEL: i16_i32_kernel_preload_arg:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x10
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NEXT: v_mov_b32_e32 v2, s7
+; GFX940-NEXT: global_store_short v0, v1, s[4:5] sc0 sc1
+; GFX940-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NEXT: v_mov_b32_e32 v2, s3
+; GFX90a-NEXT: global_store_short v0, v1, s[0:1]
+; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store i32 %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+; Stores an i16 to %out and a <3 x i32> to %out2; all four arguments are marked
+; inreg. On GFX940 the checks expect everything preloaded (%in in s4, %in2 in
+; s[6:8], %out2 in s[10:11]). On GFX90a %in and %in2 are read from s8 and
+; s[10:12], but %out2 is fetched with s_load_dwordx2 from kernarg offset 0x20.
+define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
+; GFX940-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: v_mov_b32_e32 v2, s8
+; GFX940-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v4, s8
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store <3 x i32> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+; Stores two adjacent i16 arguments to two different pointers. The checks
+; expect a single SGPR (s4 on GFX940, s8 on GFX90a) to feed both stores: %in is
+; written with global_store_short and %in2 with global_store_short_d16_hi from
+; the same VGPR copy. %out2 is expected in s[6:7] / s[10:11].
+define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
+; GFX940-LABEL: i16_i16_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store i16 %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+; Stores an i16 to %out and a <2 x i8> to %out2. The checks expect both values
+; to come from one SGPR (s4 on GFX940, s8 on GFX90a): %in is stored directly as
+; a short, while %in2 is rebuilt from that register shifted right by 16 and by
+; 24 and then stored as a short to %out2 (s[6:7] / s[10:11]).
+define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
+; GFX940-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s4, 24
+; GFX940-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-NEXT: global_store_short v1, v2, s[2:3] sc0 sc1
+; GFX940-NEXT: global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
+; GFX90a-NEXT: v_mov_b32_e32 v1, 0
+; GFX90a-NEXT: v_mov_b32_e32 v2, s8
+; GFX90a-NEXT: v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-NEXT: global_store_short v1, v2, s[6:7]
+; GFX90a-NEXT: global_store_short v1, v0, s[10:11]
+; GFX90a-NEXT: s_endpgm
+ store i16 %in, ptr addrspace(1) %out
+ store <2 x i8> %in2, ptr addrspace(1) %out2
+ ret void
+}
+
+; %arg1 is marked inreg but follows the non-inreg %out, so with the current behavior it is not expected to be preloaded; only %arg0 is.
+
+; Adds two i32 arguments that sit on either side of a non-inreg pointer and
+; stores the sum to %out. The checks expect %arg0 in an SGPR without a load
+; (s2 on GFX940, s6 on GFX90a), while %out (kernarg offset 0x8) and %arg1
+; (offset 0x10) are both fetched with s_load.
+define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 {
+; GFX940-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dword s3, s[0:1], 0x10
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_add_i32 s0, s2, s3
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NEXT: s_endpgm
+ %add = add i32 %arg0, %arg1
+ store i32 %add, ptr addrspace(1) %out
+ ret void
+}
+
+; Zero-extends an i8 argument to i32 and stores it to %out; the trailing
+; inreg argument %unused is never referenced in the body. The checks expect the
+; extension as `s_and_b32 ..., 0xff` on s4 (GFX940) / s8 (GFX90a) followed by a
+; global_store_dword, with no kernarg s_load.
+define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
+; GFX940-LABEL: ptr1_i8_trailing_unused:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: ptr1_i8_trailing_unused:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %ext = zext i8 %arg0 to i32
+ store i32 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
More information about the llvm-commits mailing list