[llvm-bugs] [Bug 40346] New: Clang generates unnecessary vectorized code for _mm_popcnt_u64

Wed Jan 16 20:26:09 PST 2019

https://bugs.llvm.org/show_bug.cgi?id=40346

            Bug ID: 40346
           Summary: Clang generates unnecessary vectorized code for
                    _mm_popcnt_u64
           Product: libraries
           Version: 7.0
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: me at adhokshajmishraonline.in
                CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
                    llvm-dev at redking.me.uk, spatel+llvm at rotateright.com

Created attachment 21343
  --> https://bugs.llvm.org/attachment.cgi?id=21343&action=edit
Test source code, dumped assembler source code, and LLVM IR code

Clang is generating technically correct (the behaviour of the code is
preserved) but very slow code, with no POPCNT instruction and too much
vectorization, when the _mm_popcnt_u64() intrinsic is used with -O2 and -O3.

NOTE
----
1. When the code is compiled with -O1, there is no unnecessary vectorization
and the generated code is almost as expected, but the compiler fails to take
the false dependency of these instructions into account, which results in
slower code.

2. I tested the PoC with LZCNT and TZCNT as well. Again, the compiler does not
account for the false dependency, although no unnecessary vectorization is
present.

Behaviour in (1) and (2) is consistent with bug #33869.

G++, on the other hand, generates correct code and deals with the false
dependency correctly. Code generated by G++ has been provided for reference.

Platform Details
================

CPU:       Intel(R) Core(TM) i7-6700HQ CPU
OS:        Arch Linux x86_64 Kernel Version 4.20.1-arch1-1-ARCH
Compilers: g++ (GCC) 8.2.1 20181127
           clang version 7.0.1 (tags/RELEASE_701/final)

Test Code
==========

#include <iostream>
#include <chrono>
#include <x86intrin.h>

// Reproducer / micro-benchmark: fills a 10 MiB buffer with pseudo-random
// bytes, then times 10000 passes of a manually 4x-unrolled _mm_popcnt_u64
// loop over that buffer and prints the accumulated bit count together with
// the measured throughput. argc and argv are not used.
int main(int argc, char* argv[]) {

    using namespace std;

    // size is a byte count: 10<<20 = 10485760 bytes (10 MiB).
    uint64_t size = 10<<20;
    // size/8 64-bit words, i.e. exactly `size` bytes of storage.
    uint64_t* buffer = new uint64_t[size/8];
    // Byte-wise view of the same allocation, used to initialise it below.
    char* charbuffer = reinterpret_cast<char*>(buffer);
    // Fill every byte with rand()%256. There is no srand() call in this
    // function, so rand() runs from its default seed.
    for (unsigned i=0; i<size; ++i)
        charbuffer[i] = rand()%256;

    // count: running sum of all popcount results.
    // duration: elapsed time of the timed region, in nanoseconds.
    uint64_t count,duration;
    chrono::time_point<chrono::system_clock> startP,endP;
    {
        // Timed region starts here; only the popcount loops are measured.
        startP = chrono::system_clock::now();
        count=0;
        // Repeat the full pass over the buffer 10000 times.
        for( unsigned k = 0; k < 10000; k++){
            // Tight unrolled loop with uint64_t
            // Four words are processed per iteration (i += 4). size/8 is
            // 1310720, a multiple of 4, so buffer[i+3] never runs past the
            // end of the allocation.
            for (uint64_t i=0;i<size/8;i+=4) {
                count += _mm_popcnt_u64(buffer[i]);
                count += _mm_popcnt_u64(buffer[i+1]);
                count += _mm_popcnt_u64(buffer[i+2]);
                count += _mm_popcnt_u64(buffer[i+3]);
            }
        }
        endP = chrono::system_clock::now();
        // Elapsed system_clock time converted to a nanosecond tick count.
        duration =
chrono::duration_cast<std::chrono::nanoseconds>(endP-startP).count();
        // Throughput = (10000 passes * size bytes) / nanoseconds, which is
        // bytes per nanosecond and therefore numerically GB/s (10^9 bytes/s).
        cout << "Counter\t"  << count << "\tSpeed\t" <<
(10000.0*size)/(duration) << " GB/s" << endl;
    }

    // NOTE(review): this memory was obtained with new[] above but is
    // released with free(); the matching release would be `delete[] buffer`.
    // Left unchanged so the reproducer stays as originally reported.
    free(charbuffer);
}

OUTPUT (POPCNT)
===============

By GCC
------

[code snipped]
...
mov    r13,rax
xor    ebx,ebx
xor    edx,edx
xor    eax,eax
popcnt rax,QWORD PTR [r14+0x8]
popcnt rdx,QWORD PTR [r14]
add    rdx,rax
xor    eax,eax
popcnt rax,QWORD PTR [r14+0x10]
add    rdx,rax
xor    eax,eax
popcnt rax,QWORD PTR [r14+0x18]
add    rdx,rax
mov    eax,0x2710
nop    DWORD PTR [rax+0x0]
add    rbx,rdx
dec    eax
jne    1150 <main+0x80>
add    r14,0x20
cmp    r14,rbp
jne    111c <main+0x4c>
...
[code snipped]

By Clang
--------

Note: please refer to attachment for complete listing.

[code snipped]
...
vpbroadcastq ymm2,QWORD PTR [rip+0xe26]
vpbroadcastq ymm4,QWORD PTR [rip+0xe25]
vpbroadcastq ymm7,QWORD PTR [rip+0xe24]
vbroadcastsd ymm0,QWORD PTR [rip+0xe23]

vmovups YMMWORD PTR [rsp+0x80],ymm0
vmovdqa ymm5,YMMWORD PTR [rip+0xd9a]
vmovdqa ymm6,YMMWORD PTR [rip+0xdb2]
vbroadcastsd ymm0,QWORD PTR [rip+0xe09]
vmovups YMMWORD PTR [rsp+0x60],ymm0
vpbroadcastq ymm0,QWORD PTR [rip+0xe02]

vmovdqu YMMWORD PTR [rsp+0x40],ymm0
vpbroadcastq ymm8,QWORD PTR [rip+0xdfb]

xor    r15d,r15d
vmovdqu YMMWORD PTR [rsp+0xe0],ymm2
vmovdqu YMMWORD PTR [rsp+0xc0],ymm4
vmovdqu YMMWORD PTR [rsp+0xa0],ymm7
vmovdqu YMMWORD PTR [rsp+0x20],ymm8
nop    DWORD PTR [rax+0x0]
vmovq  xmm1,r15
mov    eax,0x50000
vmovdqa ymm11,YMMWORD PTR [rip+0xd0e]

vpxor  xmm3,xmm3,xmm3
vpxor  xmm15,xmm15,xmm15
vpxor  xmm0,xmm0,xmm0
nop
vpaddq ymm12,ymm11,ymm2
vpaddq ymm13,ymm11,ymm4
vpaddq ymm14,ymm11,ymm7
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm4,QWORD PTR [rbx+ymm11*8],ymm2
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm8,QWORD PTR [rbx+ymm12*8],ymm2
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm10,QWORD PTR [rbx+ymm13*8],ymm2
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm9,QWORD PTR [rbx+ymm14*8],ymm2
vpand  ymm2,ymm4,ymm5
vpshufb ymm2,ymm6,ymm2
vpsrlw ymm4,ymm4,0x4
vpand  ymm4,ymm4,ymm5
vpshufb ymm4,ymm6,ymm4
vpaddb ymm2,ymm4,ymm2
vpsadbw ymm2,ymm2,YMMWORD PTR [rip+0xd09]

vpaddq ymm1,ymm2,ymm1
vmovdqu YMMWORD PTR [rsp],ymm1
vpand  ymm2,ymm8,ymm5
vpshufb ymm2,ymm6,ymm2
vpsrlw ymm4,ymm8,0x4
vpand  ymm4,ymm4,ymm5
vpshufb ymm4,ymm6,ymm4
vpaddb ymm2,ymm4,ymm2
vpsadbw ymm2,ymm2,YMMWORD PTR [rip+0xcdc]
...
[code snipped]

OUTPUT (TZCNT)
==============

By GCC
------

[code snipped]
...
mov    r13,rax
xor    ebx,ebx
xor    edi,edi
xor    edx,edx
tzcnt  rdi,QWORD PTR [r14]
tzcnt  rdx,QWORD PTR [r14+0x8]
xor    esi,esi
tzcnt  rsi,QWORD PTR [r14+0x10]
xor    ecx,ecx
tzcnt  rcx,QWORD PTR [r14+0x18]
add    rdx,rdi
add    rdx,rsi
mov    eax,0x2710
add    rdx,rcx
nop    DWORD PTR [rax+0x0]
add    rbx,rdx
dec    eax
jne    1150 <main+0x80>
add    r14,0x20
cmp    r14,rbp
jne    111c <main+0x4c>
...
[code snipped]

By Clang
--------

[code snipped]
...
mov    r14,rax
xor    ebx,ebx
nop    WORD PTR cs:[rax+rax*1+0x0]

nop    DWORD PTR [rax]
xor    eax,eax
nop    WORD PTR cs:[rax+rax*1+0x0]

nop    DWORD PTR [rax+0x0]
tzcnt  rcx,QWORD PTR [r15+rax*8]
add    rcx,rbx
tzcnt  rdx,QWORD PTR [r15+rax*8+0x8]
add    rdx,rcx
tzcnt  rcx,QWORD PTR [r15+rax*8+0x10]
add    rcx,rdx
tzcnt  rdx,QWORD PTR [r15+rax*8+0x18]
add    rdx,rcx
tzcnt  rcx,QWORD PTR [r15+rax*8+0x20]
add    rcx,rdx
tzcnt  rdx,QWORD PTR [r15+rax*8+0x28]
add    rdx,rcx
tzcnt  rcx,QWORD PTR [r15+rax*8+0x30]
add    rcx,rdx
tzcnt  rbx,QWORD PTR [r15+rax*8+0x38]
add    rbx,rcx
add    rax,0x8
cmp    rax,0x140000
jb     1290 <main+0x60>
add    r12d,0x1
cmp    r12d,0x2710
jne    1280 <main+0x50>
...
[code snipped]

OUTPUT (LZCNT)
==============

By GCC
------

[code snipped]
...
mov    r13,rax
xor    ebx,ebx
xor    edi,edi
xor    edx,edx
lzcnt  rdi,QWORD PTR [r14]
lzcnt  rdx,QWORD PTR [r14+0x8]
xor    esi,esi
add    rdx,rdi
lzcnt  rsi,QWORD PTR [r14+0x10]
xor    ecx,ecx
add    rdx,rsi
lzcnt  rcx,QWORD PTR [r14+0x18]
mov    eax,0x2710
add    rdx,rcx
nop    DWORD PTR [rax+0x0]
add    rbx,rdx
dec    eax
jne    1150 <main+0x80>
add    r14,0x20
cmp    r14,rbp
jne    111c <main+0x4c>
...
[code snipped]

By Clang
--------

[code snipped]
...
mov    r14,rax
xor    ebx,ebx
nop    WORD PTR cs:[rax+rax*1+0x0]

nop    DWORD PTR [rax]
xor    eax,eax
nop    WORD PTR cs:[rax+rax*1+0x0]

nop    DWORD PTR [rax+0x0]
lzcnt  rcx,QWORD PTR [r15+rax*8]
add    rcx,rbx
lzcnt  rdx,QWORD PTR [r15+rax*8+0x8]
add    rdx,rcx
lzcnt  rcx,QWORD PTR [r15+rax*8+0x10]
add    rcx,rdx
lzcnt  rdx,QWORD PTR [r15+rax*8+0x18]
add    rdx,rcx
lzcnt  rcx,QWORD PTR [r15+rax*8+0x20]
add    rcx,rdx
lzcnt  rdx,QWORD PTR [r15+rax*8+0x28]
add    rdx,rcx
lzcnt  rcx,QWORD PTR [r15+rax*8+0x30]
add    rcx,rdx
lzcnt  rbx,QWORD PTR [r15+rax*8+0x38]
add    rbx,rcx
add    rax,0x8
cmp    rax,0x140000
jb     1290 <main+0x60>
add    r12d,0x1
cmp    r12d,0x2710
jne    1280 <main+0x50>
...
[code snipped]

Test code, assembly generated by Clang, and IR code are attached herewith in
ZIP format.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190117/cea84bb1/attachment-0001.html>