[llvm] [NFC][AMDGPU] Autogenerate tests for uniform i32 promo in ISel (PR #106382)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 01:48:37 PDT 2024
================
@@ -26,34 +74,73 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
ret void
}
-; GCN-LABEL: {{^}}v_abs_v2i16:
-; GFX9: global_load_dword [[VAL:v[0-9]+]]
-; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
-; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
-
-; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, [[ZERO]], v{{[0-9]+}}
-; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI-DAG: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NOT: v_and_b32
-; VI: v_or_b32_e32
-
-; CI: buffer_load_dword v
-; CI: v_lshrrev_b32_e32
-; CI-DAG: v_sub_i32_e32
-; CI-DAG: v_bfe_i32
-; CI-DAG: v_bfe_i32
-; CI-DAG: v_max_i32
-; CI-DAG: v_max_i32
-; CI-DAG: v_add_i32
-; CI-DAG: v_add_i32
-; CI-DAG: v_or_b32
define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
+; GFX9-LABEL: v_abs_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v2
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; VI-LABEL: v_abs_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_mov_b32_e32 v5, 2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u16_e32 v2, 0, v3
+; VI-NEXT: v_sub_u16_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_max_i16_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_i16_e32 v2, v3, v2
+; VI-NEXT: v_add_u16_e32 v2, 2, v2
+; VI-NEXT: v_add_u16_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; CI-LABEL: v_abs_v2i16:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_bfe_i32 v3, v2, 0, 16
+; CI-NEXT: v_ashrrev_i32_e32 v4, 16, v2
+; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
+; CI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; CI-NEXT: v_sub_i32_e32 v5, vcc, 0, v5
+; CI-NEXT: v_bfe_i32 v5, v5, 0, 16
+; CI-NEXT: v_max_i32_e32 v2, v3, v2
+; CI-NEXT: v_max_i32_e32 v3, v4, v5
+; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v2
+; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v3, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, 0x20000, v2
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
----------------
arsenm wrote:
Like the other file, these really should just use a regular function to get VGPR arguments; all this other noise is just to get a divergent VGPR in a kernel.
https://github.com/llvm/llvm-project/pull/106382
More information about the llvm-commits
mailing list