[llvm] b5afda8 - AMDGPU: Add some more ctlz_zero_undef tests
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed May 8 07:13:28 PDT 2024
Author: Matt Arsenault
Date: 2024-05-08T16:13:22+02:00
New Revision: b5afda8d760998641cf08a6d229252924b0ad146
URL: https://github.com/llvm/llvm-project/commit/b5afda8d760998641cf08a6d229252924b0ad146
DIFF: https://github.com/llvm/llvm-project/commit/b5afda8d760998641cf08a6d229252924b0ad146.diff
LOG: AMDGPU: Add some more ctlz_zero_undef tests
Added:
Modified:
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 21aff62b9226d..54adde38d6d22 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -4,9 +4,17 @@
; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
+declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
+declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone
+
+declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
+declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
@@ -2158,3 +2166,468 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
store i32 %sel, ptr addrspace(1) %out
ret void
}
+
+define i7 @v_ctlz_zero_undef_i7(i7 %val) {
+; SI-LABEL: v_ctlz_zero_undef_i7:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_i7:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: v_add_u16_e32 v0, -9, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_i7:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
+ ret i7 %ctlz
+}
+
+define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i18:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_and_b32 s2, s2, 0x3ffff
+; SI-NEXT: s_flbit_i32_b32 s2, s2
+; SI-NEXT: s_add_i32 s4, s2, -14
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_bfe_u32 s4, s4, 0x20010
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i18:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s2, s2, 0x3ffff
+; VI-NEXT: s_flbit_i32_b32 s2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_i32 s2, s2, -14
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_u32 s0, s0, 2
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_byte v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i18:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 30, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 262143(3.673406e-40), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: -14(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T1.X, PV.W, PS,
+; EG-NEXT: LSHL * T1.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T1.Y, 0.0,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T3.W, PV.W, literal.x,
+; EG-NEXT: MOV * T4.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 2(2.802597e-45)
+; EG-NEXT: BFE_UINT T0.W, T0.W, literal.x, PS,
+; EG-NEXT: LSHL * T3.W, PV.W, literal.y,
+; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV T1.Z, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR T2.X, T2.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0x3ffff
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 14
+; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff
+; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-GISEL-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT: s_endpgm
+ %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone
+ store i18 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define i18 @v_ctlz_zero_undef_i18(i18 %val) {
+; SI-LABEL: v_ctlz_zero_undef_i18:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_i18:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_i18:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
+ ret i18 %ctlz
+}
+
+define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i18:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, -14, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i18:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1
+; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -14, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i18:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x3ffff, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 14, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
+ ret <2 x i18> %ctlz
+}
+
+define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
+ ret <2 x i16> %ctlz
+}
+
+define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v3i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_ffbh_u32_e32 v2, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT: v_or_b32_e32 v2, 0x100000, v2
+; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v3i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v3i16:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
+ ret <3 x i16> %ctlz
+}
+
+define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v4i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_ffbh_u32_e32 v3, v3
+; SI-NEXT: v_ffbh_u32_e32 v2, v2
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v4i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_ffbh_u32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0xfff00000, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v4i16:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
+ ret <4 x i16> %ctlz
+}
+
+define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i8:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0xffffe800, v0
+; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 0xe800, v1
+; VI-NEXT: v_subrev_u16_e32 v0, 24, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i8:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
+ ret <2 x i8> %ctlz
+}
+
+define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i7:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0x7f, v1
+; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0
+; SI-NEXT: v_subrev_i32_e32 v1, vcc, 25, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i7:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v2, 0x7f007f, v0
+; VI-NEXT: v_bfe_u32 v0, v0, 16, 7
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: v_add_u16_e32 v1, -9, v0
+; VI-NEXT: v_and_b32_e32 v0, 0x7f, v2
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: v_add_u16_e32 v0, -9, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i7:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7f, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 25, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
+ ret <2 x i7> %ctlz
+}
More information about the llvm-commits
mailing list