[clang] [llvm] [AMDGPU] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 (PR #164847)

Matt Arsenault via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 31 22:35:43 PDT 2025


================
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+
+define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt032_not_for_vregs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT:    global_load_dword v2, v[2:3], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT:    v_sub_u32_e32 v3, 32, v2
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v3
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %val0 = load volatile i32, ptr addrspace(1) %gep
+  %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+  %result2 = sub i32 32, %result
+  call void asm "; use $0", "s"(i32 %result2)
+  %cmp = icmp ne i32 %result2, 0
+  %zext = zext i1 %cmp to i32
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt064_not_for_vregs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v3, v3, v2
+; CHECK-NEXT:    v_sub_co_u32_e32 v5, vcc, 64, v3
+; CHECK-NEXT:    v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[5:6]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[3:4], off
+; CHECK-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+  %val0 = load volatile i64, ptr addrspace(1) %gep
----------------
arsenm wrote:

A volatile load is ineligible for the VALU-load-to-scalar-load optimization. For your purposes, it is simpler to pass the value as an inreg argument to the shader calling convention than to go through all of this boilerplate loading it from memory.

https://github.com/llvm/llvm-project/pull/164847
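
For reference, a minimal sketch of the inreg form suggested above (a hypothetical test, not taken from the PR): the value arrives pre-loaded in an SGPR, so the ctpop and subtract stay on the SALU and the new pattern can be exercised without any load boilerplate.

; Hypothetical example: pass %val0 inreg so it lands in an SGPR directly.
declare i32 @llvm.ctpop.i32(i32)

define amdgpu_ps void @bcnt032_inreg(i32 inreg %val0, ptr addrspace(1) %out) {
  %result = call i32 @llvm.ctpop.i32(i32 %val0)
  %result2 = sub i32 32, %result   ; 32 - popcount(x): candidate for s_bcnt0_i32
  store i32 %result2, ptr addrspace(1) %out
  ret void
}

With the input in an SGPR, the whole computation should be uniform, so the combine under review can be checked without the global-load and wait-count noise seen in the diff above.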

