[llvm] [AMDGPU] Support preloading hidden kernel arguments (PR #98861)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 15 18:31:40 PDT 2024
================
@@ -0,0 +1,597 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+
+; Loads an i32 from offset 0 of the implicit-argument pointer and stores it to
+; the inreg %out pointer. The expected output contains the kernarg preload
+; header and no s_load: the stored value is copied straight from an SGPR
+; (s4 on gfx940, s8 on gfx90a) and %out is used from s[2:3] / s[6:7].
+; NOTE(review): offset 0 is presumably the hidden block_count_x argument, going
+; by the function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_block_count_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_block_count_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s8
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Same body as @preload_block_count_x, but with an additional unnamed, unused
+; `i32 inreg` explicit argument. The expected SGPR holding the implicit value
+; moves one register further (s5 on gfx940, s9 on gfx90a), and there is still
+; no s_load in the expected output.
+define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 {
+; GFX940-LABEL: preload_unused_arg_block_count_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_unused_arg_block_count_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Negative case: the kernel takes an extra unnamed `i256 inreg` argument and,
+; unlike the neighbouring tests, does not reference attribute group #0. The
+; expected output still has the preload header and uses %out from SGPRs
+; (s[6:7] on gfx940, s[10:11] on gfx90a), but the implicit value is fetched
+; with s_load_dword at offset 0x28 from the kernarg base instead of being read
+; from a preloaded SGPR.
+; NOTE(review): per the function name the intent is that no user SGPRs are left
+; for the implicit argument -- the omission of #0 looks deliberate; confirm.
+define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) {
+; GFX940-LABEL: no_free_sgprs_block_count_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dword s0, s[2:3], 0x28
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: no_free_sgprs_block_count_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dword s0, s[6:7], 0x28
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Negative case: %out is not marked inreg. The expected output lists no kernarg
+; preload header; both %out (offset 0x0) and the implicit value (offset 0x8)
+; are fetched from the kernarg segment with s_load instructions.
+define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
+; GFX940-LABEL: no_inreg_block_count_x:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: no_inreg_block_count_x:
+; GFX90a: ; %bb.0:
+; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Implicit arg preloading is currently restricted to cases where all explicit
+; args are inreg (preloaded).
+
+; Negative case: only the second (unnamed i32) explicit argument is inreg while
+; %out is not. The expected output lists no kernarg preload header; %out is
+; loaded from offset 0x0 and the implicit value from offset 0x10 of the kernarg
+; segment.
+define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 {
+; GFX940-LABEL: mixed_inreg_block_count_x:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x10
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: mixed_inreg_block_count_x:
+; GFX90a: ; %bb.0:
+; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Negative case: the load at offset 0 of the implicit-argument pointer is an
+; i64 rather than an i32. The expected output keeps the preload header and uses
+; %out from SGPRs (s[2:3] / s[6:7]), but the implicit value is fetched with
+; s_load_dwordx2 at kernarg offset 0x8 instead of coming from preloaded SGPRs.
+define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: incorrect_type_i64_block_count_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: incorrect_type_i64_block_count_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX90a-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i64, ptr addrspace(4) %imp_arg_ptr
+ store i64 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Negative case: the load at offset 0 of the implicit-argument pointer is an
+; i16 rather than an i32. The expected output keeps the preload header and uses
+; %out from SGPRs (s[2:3] / s[6:7]), but the implicit value is fetched with
+; s_load_dword at kernarg offset 0x8 and written with global_store_short.
+define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: incorrect_type_i16_block_count_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_load_dword s0, s[0:1], 0x8
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: incorrect_type_i16_block_count_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i16, ptr addrspace(4) %imp_arg_ptr
+ store i16 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i32 from byte offset 4 of the implicit-argument pointer and stores
+; it to the inreg %out pointer. The expected output has no s_load: the value is
+; copied from s5 on gfx940 and s9 on gfx90a.
+; NOTE(review): offset 4 is presumably hidden block_count_y, going by the
+; function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_block_count_y:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_block_count_y:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Negative case: loads an i32 from byte offset 2 of the implicit-argument
+; pointer, which is not a multiple of 4. The expected output keeps the preload
+; header and uses %out from SGPRs, but the value is fetched from memory with
+; s_load_dword using an SGPR offset of 8 plus an immediate offset of 0x2.
+define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: random_incorrect_offset:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_mov_b32 s4, 8
+; GFX940-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: random_incorrect_offset:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_mov_b32 s0, 8
+; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i32 from byte offset 8 of the implicit-argument pointer and stores
+; it to the inreg %out pointer. The expected output has no s_load: the value is
+; copied from s6 on gfx940 and s10 on gfx90a.
+; NOTE(review): offset 8 is presumably hidden block_count_z, going by the
+; function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_block_count_z:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s6
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_block_count_z:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+; Adds an `i8 inreg %val` explicit argument after %out, then stores the sum of
+; the zero-extended %val and the i32 at offset 0 of the implicit-argument
+; pointer. In the expected output %val is masked with 0xff out of s4 (gfx940) /
+; s8 (gfx90a) and the implicit value is taken from the following SGPR
+; (s5 / s9); no s_load is expected.
+; NOTE(review): per the function name this is meant to cover alignment of the
+; implicit-argument base after a sub-dword explicit argument -- confirm.
+define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 {
+; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s4, 0xff
+; GFX940-NEXT: s_add_i32 s0, s5, s0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_add_i32 s0, s9, s0
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ %ext = zext i8 %val to i32
+ %add = add i32 %load, %ext
+ store i32 %add, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads three i32 values from byte offsets 0, 4 and 8 of the implicit-argument
+; pointer, packs them into a <3 x i32> and stores the vector to %out. The
+; expected output reads all three from consecutive SGPRs (s4..s6 on gfx940,
+; s8..s10 on gfx90a) and emits a single global_store_dwordx3, with no s_load.
+define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_block_count_xyz:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_block_count_xyz:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s8
+; GFX90a-NEXT: v_mov_b32_e32 v1, s9
+; GFX90a-NEXT: v_mov_b32_e32 v2, s10
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
+ %load_x = load i32, ptr addrspace(4) %gep_x
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load_y = load i32, ptr addrspace(4) %gep_y
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
+ %load_z = load i32, ptr addrspace(4) %gep_z
+ %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i16 from byte offset 12 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. The expected output extracts
+; the low 16 bits of s7 (gfx940) / s11 (gfx90a) with s_and_b32 0xffff; no
+; s_load is expected.
+; NOTE(review): offset 12 is presumably hidden group_size_x, going by the
+; function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_workgroup_size_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_workgroup_size_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i16 from byte offset 14 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. The expected output extracts
+; the high 16 bits of s7 (gfx940) / s11 (gfx90a) with s_lshr_b32 by 16; no
+; s_load is expected.
+; NOTE(review): offset 14 is presumably hidden group_size_y, going by the
+; function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_workgroup_size_y:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s7, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_workgroup_size_y:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i16 from byte offset 16 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. The expected output extracts
+; the low 16 bits of s8 (gfx940) / s12 (gfx90a) with s_and_b32 0xffff; no
+; s_load is expected.
+; NOTE(review): offset 16 is presumably hidden group_size_z, going by the
+; function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_workgroup_size_z:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_workgroup_size_z:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads three i16 values from byte offsets 12, 14 and 16 of the
+; implicit-argument pointer, zero-extends each to i32, packs them into a
+; <3 x i32> and stores the vector to %out. The expected output derives all
+; three from two SGPRs (low/high halves of s7 and low half of s8 on gfx940;
+; s11 and s12 on gfx90a) with no s_load.
+define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_workgroup_size_xyz:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s7, 16
+; GFX940-NEXT: s_and_b32 s1, s7, 0xffff
+; GFX940-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_workgroup_size_xyz:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
+; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
+ %load_x = load i16, ptr addrspace(4) %gep_x
+ %conv_x = zext i16 %load_x to i32
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
+ %load_y = load i16, ptr addrspace(4) %gep_y
+ %conv_y = zext i16 %load_y to i32
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
+ %load_z = load i16, ptr addrspace(4) %gep_z
+ %conv_z = zext i16 %load_z to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i16 from byte offset 18 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. The expected output extracts
+; the high 16 bits of s8 (gfx940) / s12 (gfx90a) with s_lshr_b32 by 16; no
+; s_load is expected.
+; NOTE(review): offset 18 is presumably hidden remainder_x, going by the
+; function name -- confirm against the hidden kernarg layout.
+define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preload_remainder_x:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s8, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preload_remainder_x:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i16 from byte offset 20 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. The expected output extracts
+; the low 16 bits of s9 (gfx940) / s13 (gfx90a) with s_and_b32 0xffff; no
+; s_load is expected.
+; NOTE(review): the name lacks the underscore used by @preload_remainder_x
+; ("preloadremainder" vs "preload_remainder"); consider renaming the function
+; and regenerating the checks for consistency.
+define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preloadremainder_y:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preloadremainder_y:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads an i16 from byte offset 22 of the implicit-argument pointer,
+; zero-extends it to i32 and stores it to %out. The expected output extracts
+; the high 16 bits of s9 (gfx940) / s13 (gfx90a) with s_lshr_b32 by 16; no
+; s_load is expected.
+; NOTE(review): the name lacks the underscore used by @preload_remainder_x
+; ("preloadremainder" vs "preload_remainder"); consider renaming the function
+; and regenerating the checks for consistency.
+define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preloadremainder_z:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s9, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preloadremainder_z:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s0
+; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load = load i16, ptr addrspace(4) %gep
+ %conv = zext i16 %load to i32
+ store i32 %conv, ptr addrspace(1) %out
+ ret void
+}
+
+; Loads three i16 values from byte offsets 18, 20 and 22 of the
+; implicit-argument pointer, zero-extends each to i32, packs them into a
+; <3 x i32> and stores the vector to %out. The expected output derives all
+; three from two SGPRs (high half of s8 plus low/high halves of s9 on gfx940;
+; s12 and s13 on gfx90a) with no s_load.
+; NOTE(review): the name lacks the underscore used by @preload_remainder_x
+; ("preloadremainder" vs "preload_remainder"); consider renaming the function
+; and regenerating the checks for consistency.
+define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 {
+; GFX940-LABEL: preloadremainder_xyz:
+; GFX940: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-NEXT: ; %bb.0:
+; GFX940-NEXT: s_lshr_b32 s0, s9, 16
+; GFX940-NEXT: s_lshr_b32 s1, s8, 16
+; GFX940-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
+;
+; GFX90a-LABEL: preloadremainder_xyz:
+; GFX90a: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-NEXT: ; %bb.0:
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
+; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-NEXT: v_mov_b32_e32 v3, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, s1
+; GFX90a-NEXT: v_mov_b32_e32 v1, s2
+; GFX90a-NEXT: v_mov_b32_e32 v2, s0
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: s_endpgm
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
+ %load_x = load i16, ptr addrspace(4) %gep_x
+ %conv_x = zext i16 %load_x to i32
+ %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
+ %load_y = load i16, ptr addrspace(4) %gep_y
+ %conv_y = zext i16 %load_y to i32
+ %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
+ %load_z = load i16, ptr addrspace(4) %gep_z
+ %conv_z = zext i16 %load_z to i32
+ %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0
+ %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
+ %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
+ store <3 x i32> %ins.2, ptr addrspace(1) %out
+ ret void
+}
+
----------------
kerbowa wrote:
There are many examples of this in preload-kernargs.ll with normal arguments. I added a test here where we don't have enough user SGPRs to preload an implicit_arg.
https://github.com/llvm/llvm-project/pull/98861
More information about the llvm-commits mailing list