[llvm] [LLVM] Make use of s_flbit_i32_b64 and s_ff1_i32_b64 (PR #75158)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 19 02:00:21 PST 2023


================
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+
+define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; GFX9-LABEL: ctlz_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT:    global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, v4, v3
+; GFX9-NEXT:    v_ffbh_u32_e32 v2, v2
+; GFX9-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX9-NEXT:    v_add_u32_e64 v2, v2, 32 clamp
+; GFX9-NEXT:    v_min_u32_e32 v0, v2, v0
+; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ctlz_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT:    global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
+; GFX10-NEXT:    v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT:    s_endpgm
+  %val = load i64, ptr addrspace(1) %arrayidx, align 1
+  %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
----------------
jayfoad wrote:

Can you also add a test for the `i1 false` version, and the same for cttz?

https://github.com/llvm/llvm-project/pull/75158


More information about the llvm-commits mailing list