[llvm] b5afda8 - AMDGPU: Add some more ctlz_zero_undef tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 8 07:13:28 PDT 2024


Author: Matt Arsenault
Date: 2024-05-08T16:13:22+02:00
New Revision: b5afda8d760998641cf08a6d229252924b0ad146

URL: https://github.com/llvm/llvm-project/commit/b5afda8d760998641cf08a6d229252924b0ad146
DIFF: https://github.com/llvm/llvm-project/commit/b5afda8d760998641cf08a6d229252924b0ad146.diff

LOG: AMDGPU: Add some more ctlz_zero_undef tests

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 21aff62b9226d..54adde38d6d22 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -4,9 +4,17 @@
 ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
 
+declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
+declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
 
 declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
+declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone
+
+declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
+declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
@@ -2158,3 +2166,468 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
   store i32 %sel, ptr addrspace(1) %out
   ret void
 }
+
+define i7 @v_ctlz_zero_undef_i7(i7 %val) {
+; SI-LABEL: v_ctlz_zero_undef_i7:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 25, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_i7:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_add_u16_e32 v0, -9, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_i7:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
+  ret i7 %ctlz
+}
+
+define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i18:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s2, s2, 0x3ffff
+; SI-NEXT:    s_flbit_i32_b32 s2, s2
+; SI-NEXT:    s_add_i32 s4, s2, -14
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    s_bfe_u32 s4, s4, 0x20010
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i18:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_and_b32 s2, s2, 0x3ffff
+; VI-NEXT:    s_flbit_i32_b32 s2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_i32 s2, s2, -14
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_add_u32 s0, s0, 2
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_byte v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i18:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 30, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
+; EG-NEXT:    MEM_RAT MSKOR T0.XW, T2.X
+; EG-NEXT:    CF_END
+; EG-NEXT:    ALU clause starting at 4:
+; EG-NEXT:     AND_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    262143(3.673406e-40), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT:    -14(nan), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T2.W, PV.W, literal.x,
+; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT:     LSHL T1.X, PV.W, PS,
+; EG-NEXT:     LSHL * T1.W, literal.x, PS,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     MOV T1.Y, 0.0,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     AND_INT T3.W, PV.W, literal.x,
+; EG-NEXT:     MOV * T4.W, literal.y,
+; EG-NEXT:    3(4.203895e-45), 2(2.802597e-45)
+; EG-NEXT:     BFE_UINT T0.W, T0.W, literal.x, PS,
+; EG-NEXT:     LSHL * T3.W, PV.W, literal.y,
+; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
+; EG-NEXT:     LSHL T0.X, PV.W, PS,
+; EG-NEXT:     LSHL * T0.W, literal.x, PS,
+; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT:     MOV T0.Y, 0.0,
+; EG-NEXT:     MOV T1.Z, 0.0,
+; EG-NEXT:     MOV * T0.Z, 0.0,
+; EG-NEXT:     LSHR T2.X, T2.W, literal.x,
+; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0x3ffff
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 14
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 0x3ffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT:    s_endpgm
+  %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone
+  store i18 %ctlz, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define i18 @v_ctlz_zero_undef_i18(i18 %val) {
+; SI-LABEL: v_ctlz_zero_undef_i18:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -14, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_i18:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -14, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_i18:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
+  ret i18 %ctlz
+}
+
+define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i18:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -14, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, -14, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i18:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -14, v0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -14, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i18:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 14, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
+  ret <2 x i18> %ctlz
+}
+
+define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i16:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
+  ret <2 x i16> %ctlz
+}
+
+define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v3i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT:    v_or_b32_e32 v2, 0x100000, v2
+; SI-NEXT:    v_alignbit_b32 v1, v3, v0, 16
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v3i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v3i16:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
+  ret <3 x i16> %ctlz
+}
+
+define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v4i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v3, v3
+; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v4i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_ffbh_u32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 0xfff00000, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v4i16:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
+  ret <4 x i16> %ctlz
+}
+
+define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xffffe800, v0
+; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v1, 0xe800, v1
+; VI-NEXT:    v_subrev_u16_e32 v0, 24, v0
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i8:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
+  ret <2 x i8> %ctlz
+}
+
+define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
+; SI-LABEL: v_ctlz_zero_undef_v2i7:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 25, v0
+; SI-NEXT:    v_subrev_i32_e32 v1, vcc, 25, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_ctlz_zero_undef_v2i7:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v2, 0x7f007f, v0
+; VI-NEXT:    v_bfe_u32 v0, v0, 16, 7
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_add_u16_e32 v1, -9, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v2
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT:    v_add_u16_e32 v0, -9, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; EG-LABEL: v_ctlz_zero_undef_v2i7:
+; EG:       ; %bb.0:
+; EG-NEXT:    CF_END
+; EG-NEXT:    PAD
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 25, v1
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
+  ret <2 x i7> %ctlz
+}


        


More information about the llvm-commits mailing list