[libc-commits] [libc] [libc][__support][bit] Switch popcount to Brian Kernighan’s Algorithm (PR #95625)

via libc-commits libc-commits at lists.llvm.org
Fri Jun 14 18:24:21 PDT 2024


lntue wrote:

> Interesting, under clang `-O3 -march=x86-64-v3`:
> 
> ```c++
> unsigned popcount(unsigned value) {
>   int count = 0;
>   for (int i = 0; i != 32; ++i)
>     if ((value >> i) & 0x1)
>       ++count;
> 
>   return count;
> }
> 
> 
> unsigned popcount2(unsigned value) {
>   int count = 0;
>    while (value) {
>     value &= value - 1;
>     ++count;
>   }
> 
>   return count;
> }
> ```
> 
> compiles to
> 
> ```assembly
> .LCPI0_0:
>         .long   1                               # 0x1
>         .long   2                               # 0x2
>         .long   3                               # 0x3
>         .long   4                               # 0x4
> .LCPI0_2:
>         .long   3                               # 0x3
>         .long   0                               # 0x0
>         .long   1                               # 0x1
>         .long   2                               # 0x2
> .LCPI0_5:
>         .long   24                              # 0x18
>         .long   25                              # 0x19
>         .long   26                              # 0x1a
>         .long   27                              # 0x1b
> .LCPI0_1:
>         .long   8                               # 0x8
>         .long   9                               # 0x9
>         .long   10                              # 0xa
>         .long   11                              # 0xb
>         .long   12                              # 0xc
>         .long   13                              # 0xd
>         .long   14                              # 0xe
>         .long   15                              # 0xf
> .LCPI0_4:
>         .long   16                              # 0x10
>         .long   17                              # 0x11
>         .long   18                              # 0x12
>         .long   19                              # 0x13
>         .long   20                              # 0x14
>         .long   21                              # 0x15
>         .long   22                              # 0x16
>         .long   23                              # 0x17
> .LCPI0_3:
>         .long   1                               # 0x1
> popcount(unsigned int):                           # @popcount(unsigned int)
>         vmovd   xmm0, edi
>         vpbroadcastd    ymm0, xmm0
>         vpsrlvd xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
>         mov     eax, edi
>         vpsrlvd ymm2, ymm0, ymmword ptr [rip + .LCPI0_1]
>         shr     eax, 5
>         mov     ecx, edi
>         shr     ecx, 6
>         mov     edx, edi
>         shr     edx, 7
>         vbroadcasti128  ymm3, xmmword ptr [rip + .LCPI0_2] # ymm3 = [3,0,1,2,3,0,1,2]
>         vpermd  ymm1, ymm3, ymm1
>         vpblendd        ymm1, ymm1, ymm0, 1             # ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7]
>         vmovd   xmm3, eax
>         vpbroadcastd    ymm3, xmm3
>         vpblendd        ymm1, ymm1, ymm3, 32            # ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
>         vmovd   xmm3, ecx
>         vpbroadcastd    ymm3, xmm3
>         vpblendd        ymm1, ymm1, ymm3, 64            # ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
>         vmovd   xmm3, edx
>         vpbroadcastd    ymm3, xmm3
>         vpblendd        ymm1, ymm1, ymm3, 128           # ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
>         vpbroadcastd    ymm3, dword ptr [rip + .LCPI0_3] # ymm3 = [1,1,1,1,1,1,1,1]
>         vpsrlvd ymm4, ymm0, ymmword ptr [rip + .LCPI0_4]
>         vpand   ymm2, ymm2, ymm3
>         vpand   ymm1, ymm1, ymm3
>         vpaddd  ymm1, ymm1, ymm2
>         vpand   ymm2, ymm4, ymm3
>         vpsrlvd xmm0, xmm0, xmmword ptr [rip + .LCPI0_5]
>         vpand   xmm0, xmm0, xmm3
>         mov     eax, edi
>         shr     eax, 29
>         and     eax, 1
>         mov     ecx, edi
>         shr     ecx, 31
>         vextracti128    xmm3, ymm1, 1
>         vpaddd  xmm1, xmm1, xmm3
>         vpshufd xmm3, xmm1, 238                 # xmm3 = xmm1[2,3,2,3]
>         vpaddd  xmm1, xmm1, xmm3
>         vpshufd xmm3, xmm1, 85                  # xmm3 = xmm1[1,1,1,1]
>         vpaddd  xmm1, xmm1, xmm3
>         vmovd   edx, xmm1
>         vextracti128    xmm1, ymm2, 1
>         vpaddd  xmm1, xmm2, xmm1
>         vpshufd xmm2, xmm1, 238                 # xmm2 = xmm1[2,3,2,3]
>         vpaddd  xmm1, xmm1, xmm2
>         vpshufd xmm2, xmm1, 85                  # xmm2 = xmm1[1,1,1,1]
>         vpaddd  xmm1, xmm1, xmm2
>         vmovd   esi, xmm1
>         add     esi, edx
>         vpshufd xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
>         vpaddd  xmm0, xmm0, xmm1
>         vpshufd xmm1, xmm0, 85                  # xmm1 = xmm0[1,1,1,1]
>         vpaddd  xmm0, xmm0, xmm1
>         vmovd   edx, xmm0
>         bt      edi, 28
>         adc     edx, esi
>         bt      edi, 30
>         adc     eax, 0
>         add     eax, ecx
>         add     eax, edx
>         vzeroupper
>         ret
> popcount2(unsigned int):                          # @popcount2(unsigned int)
>         popcnt  eax, edi
>         ret
> ```

Looks like clang has pattern recognition for this PR's implementation: the Kernighan-style loop in `popcount2` is lowered to a single `popcnt` instruction.

https://github.com/llvm/llvm-project/pull/95625


More information about the libc-commits mailing list