[libc-commits] [libc] [libc][__support][bit] Switch popcount to Brian Kernighan’s Algorithm (PR #95625)
via libc-commits
libc-commits at lists.llvm.org
Fri Jun 14 18:24:21 PDT 2024
lntue wrote:
> Interesting, under clang `-O3 -march=x86-64-v3`
>
> ```c++
> unsigned popcount(unsigned value) {
> int count = 0;
> for (int i = 0; i != 32; ++i)
> if ((value >> i) & 0x1)
> ++count;
>
> return count;
> }
>
>
> unsigned popcount2(unsigned value) {
> int count = 0;
> while (value) {
> value &= value - 1;
> ++count;
> }
>
> return count;
> }
> ```
>
> compiles to
>
> ```assembly
> .LCPI0_0:
> .long 1 # 0x1
> .long 2 # 0x2
> .long 3 # 0x3
> .long 4 # 0x4
> .LCPI0_2:
> .long 3 # 0x3
> .long 0 # 0x0
> .long 1 # 0x1
> .long 2 # 0x2
> .LCPI0_5:
> .long 24 # 0x18
> .long 25 # 0x19
> .long 26 # 0x1a
> .long 27 # 0x1b
> .LCPI0_1:
> .long 8 # 0x8
> .long 9 # 0x9
> .long 10 # 0xa
> .long 11 # 0xb
> .long 12 # 0xc
> .long 13 # 0xd
> .long 14 # 0xe
> .long 15 # 0xf
> .LCPI0_4:
> .long 16 # 0x10
> .long 17 # 0x11
> .long 18 # 0x12
> .long 19 # 0x13
> .long 20 # 0x14
> .long 21 # 0x15
> .long 22 # 0x16
> .long 23 # 0x17
> .LCPI0_3:
> .long 1 # 0x1
> popcount(unsigned int): # @popcount(unsigned int)
> vmovd xmm0, edi
> vpbroadcastd ymm0, xmm0
> vpsrlvd xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
> mov eax, edi
> vpsrlvd ymm2, ymm0, ymmword ptr [rip + .LCPI0_1]
> shr eax, 5
> mov ecx, edi
> shr ecx, 6
> mov edx, edi
> shr edx, 7
> vbroadcasti128 ymm3, xmmword ptr [rip + .LCPI0_2] # ymm3 = [3,0,1,2,3,0,1,2]
> vpermd ymm1, ymm3, ymm1
> vpblendd ymm1, ymm1, ymm0, 1 # ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7]
> vmovd xmm3, eax
> vpbroadcastd ymm3, xmm3
> vpblendd ymm1, ymm1, ymm3, 32 # ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
> vmovd xmm3, ecx
> vpbroadcastd ymm3, xmm3
> vpblendd ymm1, ymm1, ymm3, 64 # ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
> vmovd xmm3, edx
> vpbroadcastd ymm3, xmm3
> vpblendd ymm1, ymm1, ymm3, 128 # ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
> vpbroadcastd ymm3, dword ptr [rip + .LCPI0_3] # ymm3 = [1,1,1,1,1,1,1,1]
> vpsrlvd ymm4, ymm0, ymmword ptr [rip + .LCPI0_4]
> vpand ymm2, ymm2, ymm3
> vpand ymm1, ymm1, ymm3
> vpaddd ymm1, ymm1, ymm2
> vpand ymm2, ymm4, ymm3
> vpsrlvd xmm0, xmm0, xmmword ptr [rip + .LCPI0_5]
> vpand xmm0, xmm0, xmm3
> mov eax, edi
> shr eax, 29
> and eax, 1
> mov ecx, edi
> shr ecx, 31
> vextracti128 xmm3, ymm1, 1
> vpaddd xmm1, xmm1, xmm3
> vpshufd xmm3, xmm1, 238 # xmm3 = xmm1[2,3,2,3]
> vpaddd xmm1, xmm1, xmm3
> vpshufd xmm3, xmm1, 85 # xmm3 = xmm1[1,1,1,1]
> vpaddd xmm1, xmm1, xmm3
> vmovd edx, xmm1
> vextracti128 xmm1, ymm2, 1
> vpaddd xmm1, xmm2, xmm1
> vpshufd xmm2, xmm1, 238 # xmm2 = xmm1[2,3,2,3]
> vpaddd xmm1, xmm1, xmm2
> vpshufd xmm2, xmm1, 85 # xmm2 = xmm1[1,1,1,1]
> vpaddd xmm1, xmm1, xmm2
> vmovd esi, xmm1
> add esi, edx
> vpshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
> vpaddd xmm0, xmm0, xmm1
> vpshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1]
> vpaddd xmm0, xmm0, xmm1
> vmovd edx, xmm0
> bt edi, 28
> adc edx, esi
> bt edi, 30
> adc eax, 0
> add eax, ecx
> add eax, edx
> vzeroupper
> ret
> popcount2(unsigned int): # @popcount2(unsigned int)
> popcnt eax, edi
> ret
> ```
Looks like clang has pattern recognition for this PR's implementation: the `value &= value - 1` loop in `popcount2` is lowered to a single `popcnt` instruction.
https://github.com/llvm/llvm-project/pull/95625
More information about the libc-commits
mailing list