[llvm] [AMDGPU] Allocate i1 argument to SGPRs (PR #72461)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 22 12:32:17 PDT 2024
================
@@ -0,0 +1,819 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+; Callee side of a plain i1 argument: the checks expect it in an SGPR
+; (s[4:5] on gfx900, s0 on gfx1100), expanded to 0/1 by v_cndmask and
+; written out with a single byte store.
+define void @void_func_i1(i1 %arg0) {
+; GFX9-LABEL: void_func_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+; Caller side of a plain i1 argument: the loaded byte is masked to bit 0 and
+; compared against 1 so that the i1 is produced in s[4:5] (gfx900) or s0
+; (gfx1100) before the call through the GOT-loaded address.
+define void @test_call_void_func_i1() {
+; GFX9-LABEL: test_call_void_func_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1(i1 %val)
+ ret void
+}
+
+; Callee side of a zeroext i1 argument: the checks expect the SGPR value
+; (s[4:5] on gfx900, s0 on gfx1100) to be expanded to 0/1, combined with 12
+; via v_or, and stored as a dword.
+define void @void_func_i1_zeroext(i1 zeroext %arg0) {
+; GFX9-LABEL: void_func_i1_zeroext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_or_b32_e32 v0, 12, v0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_zeroext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %ext = zext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, ptr addrspace(1) undef
+ ret void
+}
+
+; Caller side for the zeroext callee. The call operand carries no zeroext
+; attribute; the checks expect the same setup as the plain i1 case: mask to
+; bit 0, compare against 1 into s[4:5] (gfx900) or s0 (gfx1100), then call.
+define void @test_call_void_func_i1_zeroext() {
+; GFX9-LABEL: test_call_void_func_i1_zeroext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_zeroext@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_zeroext@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1_zeroext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_zeroext@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_zeroext@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_zeroext(i1 %val)
+ ret void
+}
+
+; Callee side of a signext i1 argument: the checks expect the SGPR value
+; (s[4:5] on gfx900, s0 on gfx1100) to be expanded to 0/1 and subtracted
+; from 12 (the lowering of "sext + 12"), then stored as a dword.
+define void @void_func_i1_signext(i1 signext %arg0) {
+; GFX9-LABEL: void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_sub_u32_e32 v0, 12, v0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %ext = sext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, ptr addrspace(1) undef
+ ret void
+}
+
+; Caller side for the signext callee. The call operand carries no signext
+; attribute; the checks expect the same setup as the plain i1 case: mask to
+; bit 0, compare against 1 into s[4:5] (gfx900) or s0 (gfx1100), then call.
+define void @test_call_void_func_i1_signext() {
+; GFX9-LABEL: test_call_void_func_i1_signext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_signext@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_signext@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+;
+; GFX11-LABEL: test_call_void_func_i1_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_signext@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_signext@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_signext(i1 %val)
+ ret void
+}
+
+; Callee side of a [2 x i1] argument: the checks expect each element in its
+; own SGPR slot (s[4:5] and s[6:7] on gfx900; s0 and s1 on gfx1100), each
+; expanded to 0/1 and written with a separate byte store.
+define void @void_func_a2i1([2 x i1] %arg0) {
+; GFX9-LABEL: void_func_a2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store [2 x i1] %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+; Caller side of a constant [2 x i1] {0, 1} argument: the checks expect
+; element 0 as 0 and element 1 as -1 in consecutive SGPR slots
+; (s[4:5]/s[6:7] on gfx900, s0/s1 on gfx1100) before the call.
+define void @test_call_void_func_a2i1() {
+; GFX9-LABEL: test_call_void_func_a2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s10, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_a2i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_a2i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_mov_b64 s[6:7], -1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+;
+; GFX11-LABEL: test_call_void_func_a2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_a2i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_a2i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, -1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %1 = insertvalue [2 x i1] undef, i1 0, 0
+ %2 = insertvalue [2 x i1] %1, i1 1, 1
+ call void @void_func_a2i1([2 x i1] %2)
+ ret void
+}
+
+; i1 argument used directly as a branch condition: the checks expect the
+; incoming SGPR (s[4:5] on gfx900, s0 on gfx1100) to be inverted and used
+; with s_and_saveexec to guard the volatile store in %bb1.
+define void @i1_arg_i1_use(i1 %arg) {
+; GFX9-LABEL: i1_arg_i1_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
+; GFX9: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: .LBB{{[0-9]+}}_2:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_arg_i1_use:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s1, s0, -1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s1
+; GFX11: ; %bb.1: ; %bb1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: .LBB{{[0-9]+}}_2: ; %bb2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+bb:
+ br i1 %arg, label %bb2, label %bb1
+
+bb1:
+ store volatile i32 0, ptr addrspace(1) undef
+ br label %bb2
+
+bb2:
+ ret void
+}
+
+; Callee side of a 2-element i1 vector argument: unlike the scalar and array
+; cases in this file, the checks expect the elements in VGPRs v0 and v1,
+; which are packed into the low two bits of one byte before the store.
+define void @void_func_v2i1(<2 x i1> %arg0) {
+; GFX9-LABEL: void_func_v2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x i1> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+; Caller side of a constant 2-element i1 vector (0, 1): the checks expect
+; the elements to be materialized as 0 and 1 in VGPRs v0 and v1 before the
+; call through the GOT-loaded address.
+define void @test_call_void_func_v2i1() {
+; GFX9-LABEL: test_call_void_func_v2i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_v2i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_v2i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+;
+; GFX11-LABEL: test_call_void_func_v2i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s2, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_v2i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_v2i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+ %1 = insertelement <2 x i1> undef, i1 0, i32 0
+ %2 = insertelement <2 x i1> %1, i1 1, i32 1
+ call void @void_func_v2i1(<2 x i1> %2)
+ ret void
+}
+
+; Callee side of two i1 arguments: the checks expect them in consecutive
+; SGPR slots (s[4:5] and s[6:7] on gfx900; s0 and s1 on gfx1100); each is
+; expanded to 0/1 and written with its own volatile byte store.
+define void @void_func_i1_i1(i1 %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_i1_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile i1 %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+; Caller side of two i1 arguments: the first is loaded, masked to bit 0 and
+; compared against 1 into s[4:5] (gfx900) or s0 (gfx1100); the second is the
+; constant true, expected as -1 in s[6:7] (gfx900) or s1 (gfx1100).
+define void @test_call_void_func_i1_i1() {
+; GFX9-LABEL: test_call_void_func_i1_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s10, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, void_func_i1_i1@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, void_func_i1_i1@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-NEXT: s_mov_b64 s[6:7], -1
+; GFX9-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
+;
+; GFX11-LABEL: test_call_void_func_i1_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s4, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, void_func_i1_i1@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, void_func_i1_i1@gotpcrel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v2, s30, 0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s1, -1
+; GFX11-NEXT: v_writelane_b32 v2, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+ %val = load i1, ptr addrspace(1) undef
+ call void @void_func_i1_i1(i1 %val, i1 true)
+ ret void
+}
+
+; Callee side of a [2 x i1] followed by an i1: the checks expect three
+; consecutive SGPR slots (s[4:5], s[6:7], s[8:9] on gfx900; s0, s1, s2 on
+; gfx1100), each expanded to 0/1 and written with a volatile byte store.
+define void @void_func_a2i1_i1([2 x i1] %arg0, i1 %arg1) {
+; GFX9-LABEL: void_func_a2i1_i1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX9-NEXT: global_store_byte v[0:1], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX9-NEXT: global_store_byte v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a2i1_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT: global_store_b8 v[0:1], v1, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store volatile [2 x i1] %arg0, ptr addrspace(1) undef
+ store volatile i1 %arg1, ptr addrspace(1) undef
+ ret void
+}
+
+define void @many_i1_args(
+ i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, i1 %arg7,
+ i1 %arg8, i1 %arg9, i1 %arg10, i1 %arg11, i1 %arg12, i1 %arg13, i1 %arg14, i1 %arg15,
+ i1 %arg16, i1 %arg17, i1 %arg18, i1 %arg19, i1 %arg20, i1 %arg21, i1 %arg22, i1 %arg23,
+ i1 %arg24, i1 %arg25, i1 %arg26, i1 %arg27, i1 %arg28, i1 %arg29, i1 %arg30, i1 %arg31) {
----------------
jwanggit86 wrote:
No, after all the available SGPRs are used, the rest of the args are allocated to VGPRs.
https://github.com/llvm/llvm-project/pull/72461
More information about the llvm-commits
mailing list