[llvm] [AMDGPU] Support preloading hidden kernel arguments (PR #98861)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 2 11:18:47 PDT 2024
================
@@ -0,0 +1,698 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=16 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 | FileCheck -check-prefixes=GFX940-PRELOAD %s
+
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -amdgpu-kernarg-preload-count=16 < %s | llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a | FileCheck -check-prefixes=GFX90a-PRELOAD %s
+
+; Stores the i32 found at byte offset 0 of the implicit-argument pointer to
+; %out. Without preloading, the assertions expect an s_load_dword from kernarg
+; offset 0x8; with preloading they expect no scalar loads and the value to be
+; read from s4 on gfx940 and from s8 on gfx90a.
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %value = load i32, ptr addrspace(4) %implicit_args
+ store i32 %value, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores the i32 found at byte offset 4 of the implicit-argument pointer to
+; %out. Without preloading, the assertions expect an s_load_dword from kernarg
+; offset 0xc; with preloading they expect the value to be read from s5 on
+; gfx940 and from s9 on gfx90a.
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0xc
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_y:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0xc
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_y:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 4
+ %value = load i32, ptr addrspace(4) %field_ptr
+ store i32 %value, ptr addrspace(1) %out
+ ret void
+}
+
+; Stores the i32 found at byte offset 8 of the implicit-argument pointer to
+; %out. Without preloading, the assertions expect an s_load_dword from kernarg
+; offset 0x10; with preloading they expect the value to be read from s6 on
+; gfx940 and from s10 on gfx90a.
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_z:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_z:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 8
+ %value = load i32, ptr addrspace(4) %field_ptr
+ store i32 %value, ptr addrspace(1) %out
+ ret void
+}
+
+; Adds the zero-extended i8 argument %val to the i32 found at byte offset 0 of
+; the implicit-argument pointer and stores the sum to %out. Without
+; preloading, the assertions expect the dword holding %val to be loaded from
+; kernarg offset 0x8 and the implicit value from offset 0x10; with preloading
+; they expect the two values in s4/s5 on gfx940 and s8/s9 on gfx90a.
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s5, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-NO-PRELOAD-NEXT: s_add_i32 s0, s5, s0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-PRELOAD-NEXT: s_add_i32 s0, s5, s0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff
+; GFX90a-NO-PRELOAD-NEXT: s_add_i32 s2, s3, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-PRELOAD-NEXT: s_add_i32 s0, s9, s0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %value = load i32, ptr addrspace(4) %implicit_args
+ %val_wide = zext i8 %val to i32
+ %sum = add i32 %value, %val_wide
+ store i32 %sum, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the three i32 values at byte offsets 0, 4 and 8 of the
+; implicit-argument pointer, packs them into a <3 x i32> and stores the vector
+; to %out. Without preloading, the assertions expect scalar loads from the
+; kernarg segment; with preloading they expect the values in s4..s6 on gfx940
+; and s8..s10 on gfx90a.
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s2, s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s6, s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_block_count_xyz:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+ %load_x = load i32, ptr addrspace(4) %gep_x
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load_y = load i32, ptr addrspace(4) %gep_y
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+ %load_z = load i32, ptr addrspace(4) %gep_z
+ ; Seed the vector with poison rather than the deprecated undef; all three
+ ; lanes are overwritten below, so the stored value is fully defined.
+ %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the i16 at byte offset 12 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. Without preloading, the
+; assertions expect the dword at kernarg offset 0x14 to be loaded and masked
+; with 0xffff; with preloading the same mask is applied to s7 on gfx940 and
+; to s11 on gfx90a.
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_x:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 12
+ %narrow = load i16, ptr addrspace(4) %field_ptr
+ %widened = zext i16 %narrow to i32
+ store i32 %widened, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the i16 at byte offset 14 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. Without preloading, the
+; assertions expect the dword at kernarg offset 0x14 to be loaded and shifted
+; right by 16; with preloading the same shift is applied to s7 on gfx940 and
+; to s11 on gfx90a.
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x14
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x14
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_y:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 14
+ %narrow = load i16, ptr addrspace(4) %field_ptr
+ %widened = zext i16 %narrow to i32
+ store i32 %widened, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the i16 at byte offset 16 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. Without preloading, the
+; assertions expect the dword at kernarg offset 0x18 to be loaded and masked
+; with 0xffff; with preloading the same mask is applied to s8 on gfx940 and
+; to s12 on gfx90a.
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_z:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 16
+ %narrow = load i16, ptr addrspace(4) %field_ptr
+ %widened = zext i16 %narrow to i32
+ store i32 %widened, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the three i16 values at byte offsets 12, 14 and 16 of the
+; implicit-argument pointer, zero-extends each to i32, packs them into a
+; <3 x i32> and stores the vector to %out. Without preloading, the assertions
+; expect global_load_dword / global_load_ushort at offsets 20 and 24; with
+; preloading they expect the values to be extracted from s7/s8 on gfx940 and
+; s11/s12 on gfx90a.
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: global_load_dword v0, v3, s[0:1] offset:20
+; GFX940-NO-PRELOAD-NEXT: global_load_ushort v2, v3, s[0:1] offset:24
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_lshr_b32 s0, s7, 16
+; GFX940-PRELOAD-NEXT: s_and_b32 s1, s7, 0xffff
+; GFX940-PRELOAD-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: global_load_dword v0, v3, s[4:5] offset:20
+; GFX90a-NO-PRELOAD-NEXT: global_load_ushort v2, v3, s[4:5] offset:24
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_workgroup_size_xyz:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-PRELOAD-NEXT: s_and_b32 s1, s11, 0xffff
+; GFX90a-PRELOAD-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+ %load_x = load i16, ptr addrspace(4) %gep_x
+ %conv_x = zext i16 %load_x to i32
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+ %load_y = load i16, ptr addrspace(4) %gep_y
+ %conv_y = zext i16 %load_y to i32
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+ %load_z = load i16, ptr addrspace(4) %gep_z
+ %conv_z = zext i16 %load_z to i32
+ ; Seed the vector with poison rather than the deprecated undef; all three
+ ; lanes are overwritten below, so the stored value is fully defined.
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the i16 at byte offset 18 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. Without preloading, the
+; assertions expect the dword at kernarg offset 0x18 to be loaded and shifted
+; right by 16; with preloading the same shift is applied to s8 on gfx940 and
+; to s12 on gfx90a.
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x18
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preload_remainder_x:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_lshr_b32 s0, s8, 16
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x18
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preload_remainder_x:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 18
+ %narrow = load i16, ptr addrspace(4) %field_ptr
+ %widened = zext i16 %narrow to i32
+ store i32 %widened, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the i16 at byte offset 20 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. Without preloading, the
+; assertions expect the dword at kernarg offset 0x1c to be loaded and masked
+; with 0xffff; with preloading the same mask is applied to s9 on gfx940 and
+; to s13 on gfx90a.
+; NOTE(review): the sibling kernel is spelled preload_remainder_x; this name
+; lacks the underscore after "preload". Consider renaming for consistency.
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_y:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_y:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 20
+ %narrow = load i16, ptr addrspace(4) %field_ptr
+ %widened = zext i16 %narrow to i32
+ store i32 %widened, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the i16 at byte offset 22 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. Without preloading, the
+; assertions expect the dword at kernarg offset 0x1c to be loaded and shifted
+; right by 16; with preloading the same shift is applied to s9 on gfx940 and
+; to s13 on gfx90a.
+; NOTE(review): the sibling kernel is spelled preload_remainder_x; this name
+; lacks the underscore after "preload". Consider renaming for consistency.
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: s_load_dword s4, s[0:1], 0x1c
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_z:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_lshr_b32 s0, s9, 16
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x1c
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: s_lshr_b32 s2, s2, 16
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_z:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %implicit_args = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %field_ptr = getelementptr i8, ptr addrspace(4) %implicit_args, i32 22
+ %narrow = load i16, ptr addrspace(4) %field_ptr
+ %widened = zext i16 %narrow to i32
+ store i32 %widened, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads the three i16 values at byte offsets 18, 20 and 22 of the
+; implicit-argument pointer, zero-extends each to i32, packs them into a
+; <3 x i32> and stores the vector to %out. Without preloading, the assertions
+; expect global_load_dword / global_load_ushort at offsets 26 and 30; with
+; preloading they expect the values to be extracted from s8/s9 on gfx940 and
+; s12/s13 on gfx90a.
+; NOTE(review): the sibling kernel is spelled preload_remainder_x; this name
+; lacks the underscore after "preload". Consider renaming for consistency.
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) {
+; GFX940-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-NO-PRELOAD: ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT: global_load_dword v0, v3, s[0:1] offset:26
+; GFX940-NO-PRELOAD-NEXT: global_load_ushort v2, v3, s[0:1] offset:30
+; GFX940-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(1)
+; GFX940-NO-PRELOAD-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX940-NO-PRELOAD-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX940-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX940-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX940-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-NEXT: ; %bb.0:
+; GFX940-PRELOAD-NEXT: s_lshr_b32 s0, s9, 16
+; GFX940-PRELOAD-NEXT: s_lshr_b32 s1, s8, 16
+; GFX940-PRELOAD-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-NO-PRELOAD: ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT: global_load_dword v0, v3, s[4:5] offset:26
+; GFX90a-NO-PRELOAD-NEXT: global_load_ushort v2, v3, s[4:5] offset:30
+; GFX90a-NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(1)
+; GFX90a-NO-PRELOAD-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX90a-NO-PRELOAD-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX90a-NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NO-PRELOAD-NEXT: s_endpgm
+;
+; GFX90a-PRELOAD-LABEL: preloadremainder_xyz:
+; GFX90a-PRELOAD: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-NEXT: ; %bb.0:
+; GFX90a-PRELOAD-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-PRELOAD-NEXT: s_lshr_b32 s1, s12, 16
+; GFX90a-PRELOAD-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-NEXT: v_mov_b32_e32 v2, s0
+; GFX90a-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+ %load_x = load i16, ptr addrspace(4) %gep_x
+ %conv_x = zext i16 %load_x to i32
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+ %load_y = load i16, ptr addrspace(4) %gep_y
+ %conv_y = zext i16 %load_y to i32
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load_z = load i16, ptr addrspace(4) %gep_z
+ %conv_z = zext i16 %load_z to i32
+ ; Seed the vector with poison rather than the deprecated undef; all three
+ ; lanes are overwritten below, so the stored value is fully defined.
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
----------------
arsenm wrote:
Also include some handwritten IR tests with unexpected inreg placements (such as mixed inreg and non-inreg arguments), and some that load as an aggregate.
https://github.com/llvm/llvm-project/pull/98861
More information about the llvm-commits
mailing list