[llvm-bugs] [Bug 40346] New: Clang generates unnecessary vectorized code for _mm_popcnt_u64
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Jan 16 20:26:09 PST 2019
https://bugs.llvm.org/show_bug.cgi?id=40346
Bug ID: 40346
Summary: Clang generates unnecessary vectorized code for
_mm_popcnt_u64
Product: libraries
Version: 7.0
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: me at adhokshajmishraonline.in
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, spatel+llvm at rotateright.com
Created attachment 21343
--> https://bugs.llvm.org/attachment.cgi?id=21343&action=edit
Test source code, dumped assembler source code, and LLVM IR code
Clang generates technically correct code (the behaviour of the code is preserved),
but it is very slow: it contains no POPCNT instruction and is excessively vectorized
when the _mm_popcnt_u64() intrinsic is used with -O2 and -O3.
NOTE
----
1. When the code is compiled with -O1, there is no unnecessary vectorization, and the
generated code is almost as expected, but the compiler fails to take the false dependency of
these instructions into account, which results in slower code.
2. I tested the PoC with LZCNT and TZCNT as well. Again, the compiler does not
account for the false dependency, although no unnecessary vectorization is present.
Behaviour in (1) and (2) is consistent with bug #33869.
G++, on the other hand, generates correct code, and deals with the false dependency
correctly. Code generated by G++ has been provided for reference.
Platform Details
================
CPU: Intel(R) Core(TM) i7-6700HQ CPU
OS: Arch Linux x86_64 Kernel Version 4.20.1-arch1-1-ARCH
Compilers: g++ (GCC) 8.2.1 20181127
clang version 7.0.1 (tags/RELEASE_701/final)
Test Code
==========
#include <iostream>
#include <chrono>
#include <x86intrin.h>
// Reproduction driver: fills a buffer with pseudo-random bytes, then repeatedly
// sums _mm_popcnt_u64() over it in a 4x manually unrolled loop and prints the
// resulting count together with the measured throughput.
// argc/argv are accepted but not used.
int main(int argc, char* argv[]) {
    using namespace std;
    uint64_t size = 10<<20;  // buffer size in bytes (10 * 2^20)
    uint64_t* buffer = new uint64_t[size/8];
    // Byte-wise view of the same allocation, used only to fill it.
    char* charbuffer = reinterpret_cast<char*>(buffer);
    for (unsigned i=0; i<size; ++i)
        charbuffer[i] = rand()%256;
    uint64_t count,duration;
    chrono::time_point<chrono::system_clock> startP,endP;
    {
        startP = chrono::system_clock::now();
        count=0;
        for( unsigned k = 0; k < 10000; k++){
            // Tight unrolled loop with uint64_t
            // NOTE: this loop is the code under test; keep it exactly as is so
            // the compiler output discussed in the report stays reproducible.
            for (uint64_t i=0;i<size/8;i+=4) {
                count += _mm_popcnt_u64(buffer[i]);
                count += _mm_popcnt_u64(buffer[i+1]);
                count += _mm_popcnt_u64(buffer[i+2]);
                count += _mm_popcnt_u64(buffer[i+3]);
            }
        }
        endP = chrono::system_clock::now();
        duration =
            chrono::duration_cast<std::chrono::nanoseconds>(endP-startP).count();
        // 10000 passes over `size` bytes divided by elapsed nanoseconds,
        // i.e. bytes per nanosecond, which is printed as GB/s.
        cout << "Counter\t" << count << "\tSpeed\t" <<
            (10000.0*size)/(duration) << " GB/s" << endl;
    }
    // The buffer came from new[], so it must be released with delete[];
    // passing it to free() is an allocator mismatch (undefined behaviour).
    delete[] buffer;
}
OUTPUT (POPCNT)
===============
By GCC
------
[code snipped]
...
mov r13,rax
xor ebx,ebx
xor edx,edx
xor eax,eax
popcnt rax,QWORD PTR [r14+0x8]
popcnt rdx,QWORD PTR [r14]
add rdx,rax
xor eax,eax
popcnt rax,QWORD PTR [r14+0x10]
add rdx,rax
xor eax,eax
popcnt rax,QWORD PTR [r14+0x18]
add rdx,rax
mov eax,0x2710
nop DWORD PTR [rax+0x0]
add rbx,rdx
dec eax
jne 1150 <main+0x80>
add r14,0x20
cmp r14,rbp
jne 111c <main+0x4c>
...
[code snipped]
By Clang
--------
Note: please refer to attachment for complete listing.
[code snipped]
...
vpbroadcastq ymm2,QWORD PTR [rip+0xe26]
vpbroadcastq ymm4,QWORD PTR [rip+0xe25]
vpbroadcastq ymm7,QWORD PTR [rip+0xe24]
vbroadcastsd ymm0,QWORD PTR [rip+0xe23]
vmovups YMMWORD PTR [rsp+0x80],ymm0
vmovdqa ymm5,YMMWORD PTR [rip+0xd9a]
vmovdqa ymm6,YMMWORD PTR [rip+0xdb2]
vbroadcastsd ymm0,QWORD PTR [rip+0xe09]
vmovups YMMWORD PTR [rsp+0x60],ymm0
vpbroadcastq ymm0,QWORD PTR [rip+0xe02]
vmovdqu YMMWORD PTR [rsp+0x40],ymm0
vpbroadcastq ymm8,QWORD PTR [rip+0xdfb]
xor r15d,r15d
vmovdqu YMMWORD PTR [rsp+0xe0],ymm2
vmovdqu YMMWORD PTR [rsp+0xc0],ymm4
vmovdqu YMMWORD PTR [rsp+0xa0],ymm7
vmovdqu YMMWORD PTR [rsp+0x20],ymm8
nop DWORD PTR [rax+0x0]
vmovq xmm1,r15
mov eax,0x50000
vmovdqa ymm11,YMMWORD PTR [rip+0xd0e]
vpxor xmm3,xmm3,xmm3
vpxor xmm15,xmm15,xmm15
vpxor xmm0,xmm0,xmm0
nop
vpaddq ymm12,ymm11,ymm2
vpaddq ymm13,ymm11,ymm4
vpaddq ymm14,ymm11,ymm7
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm4,QWORD PTR [rbx+ymm11*8],ymm2
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm8,QWORD PTR [rbx+ymm12*8],ymm2
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm10,QWORD PTR [rbx+ymm13*8],ymm2
vpcmpeqd ymm2,ymm2,ymm2
vpgatherqq ymm9,QWORD PTR [rbx+ymm14*8],ymm2
vpand ymm2,ymm4,ymm5
vpshufb ymm2,ymm6,ymm2
vpsrlw ymm4,ymm4,0x4
vpand ymm4,ymm4,ymm5
vpshufb ymm4,ymm6,ymm4
vpaddb ymm2,ymm4,ymm2
vpsadbw ymm2,ymm2,YMMWORD PTR [rip+0xd09]
vpaddq ymm1,ymm2,ymm1
vmovdqu YMMWORD PTR [rsp],ymm1
vpand ymm2,ymm8,ymm5
vpshufb ymm2,ymm6,ymm2
vpsrlw ymm4,ymm8,0x4
vpand ymm4,ymm4,ymm5
vpshufb ymm4,ymm6,ymm4
vpaddb ymm2,ymm4,ymm2
vpsadbw ymm2,ymm2,YMMWORD PTR [rip+0xcdc]
...
[code snipped]
OUTPUT (TZCNT)
==============
By GCC
------
[code snipped]
...
mov r13,rax
xor ebx,ebx
xor edi,edi
xor edx,edx
tzcnt rdi,QWORD PTR [r14]
tzcnt rdx,QWORD PTR [r14+0x8]
xor esi,esi
tzcnt rsi,QWORD PTR [r14+0x10]
xor ecx,ecx
tzcnt rcx,QWORD PTR [r14+0x18]
add rdx,rdi
add rdx,rsi
mov eax,0x2710
add rdx,rcx
nop DWORD PTR [rax+0x0]
add rbx,rdx
dec eax
jne 1150 <main+0x80>
add r14,0x20
cmp r14,rbp
jne 111c <main+0x4c>
...
[code snipped]
By Clang
--------
[code snipped]
...
mov r14,rax
xor ebx,ebx
nop WORD PTR cs:[rax+rax*1+0x0]
nop DWORD PTR [rax]
xor eax,eax
nop WORD PTR cs:[rax+rax*1+0x0]
nop DWORD PTR [rax+0x0]
tzcnt rcx,QWORD PTR [r15+rax*8]
add rcx,rbx
tzcnt rdx,QWORD PTR [r15+rax*8+0x8]
add rdx,rcx
tzcnt rcx,QWORD PTR [r15+rax*8+0x10]
add rcx,rdx
tzcnt rdx,QWORD PTR [r15+rax*8+0x18]
add rdx,rcx
tzcnt rcx,QWORD PTR [r15+rax*8+0x20]
add rcx,rdx
tzcnt rdx,QWORD PTR [r15+rax*8+0x28]
add rdx,rcx
tzcnt rcx,QWORD PTR [r15+rax*8+0x30]
add rcx,rdx
tzcnt rbx,QWORD PTR [r15+rax*8+0x38]
add rbx,rcx
add rax,0x8
cmp rax,0x140000
jb 1290 <main+0x60>
add r12d,0x1
cmp r12d,0x2710
jne 1280 <main+0x50>
...
[code snipped]
OUTPUT (LZCNT)
==============
By GCC
------
[code snipped]
...
mov r13,rax
xor ebx,ebx
xor edi,edi
xor edx,edx
lzcnt rdi,QWORD PTR [r14]
lzcnt rdx,QWORD PTR [r14+0x8]
xor esi,esi
add rdx,rdi
lzcnt rsi,QWORD PTR [r14+0x10]
xor ecx,ecx
add rdx,rsi
lzcnt rcx,QWORD PTR [r14+0x18]
mov eax,0x2710
add rdx,rcx
nop DWORD PTR [rax+0x0]
add rbx,rdx
dec eax
jne 1150 <main+0x80>
add r14,0x20
cmp r14,rbp
jne 111c <main+0x4c>
...
[code snipped]
By Clang
--------
[code snipped]
...
mov r14,rax
xor ebx,ebx
nop WORD PTR cs:[rax+rax*1+0x0]
nop DWORD PTR [rax]
xor eax,eax
nop WORD PTR cs:[rax+rax*1+0x0]
nop DWORD PTR [rax+0x0]
lzcnt rcx,QWORD PTR [r15+rax*8]
add rcx,rbx
lzcnt rdx,QWORD PTR [r15+rax*8+0x8]
add rdx,rcx
lzcnt rcx,QWORD PTR [r15+rax*8+0x10]
add rcx,rdx
lzcnt rdx,QWORD PTR [r15+rax*8+0x18]
add rdx,rcx
lzcnt rcx,QWORD PTR [r15+rax*8+0x20]
add rcx,rdx
lzcnt rdx,QWORD PTR [r15+rax*8+0x28]
add rdx,rcx
lzcnt rcx,QWORD PTR [r15+rax*8+0x30]
add rcx,rdx
lzcnt rbx,QWORD PTR [r15+rax*8+0x38]
add rbx,rcx
add rax,0x8
cmp rax,0x140000
jb 1290 <main+0x60>
add r12d,0x1
cmp r12d,0x2710
jne 1280 <main+0x50>
...
[code snipped]
Test code, assembly generated by Clang, and IR code are attached herewith in
ZIP format.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190117/cea84bb1/attachment-0001.html>
More information about the llvm-bugs
mailing list