[llvm-bugs] [Bug 42674] [X86][SSE] Missed vXi8 sum reduction optimization
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Jul 31 23:29:02 PDT 2019
https://bugs.llvm.org/show_bug.cgi?id=42674
Joel Yliluoma <bisqwit at iki.fi> changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|FIXED |---
Status|RESOLVED |REOPENED
--- Comment #10 from Joel Yliluoma <bisqwit at iki.fi> ---
I am sorry about reopening this once again, but Clang still generates
suboptimal code for an array size of 8 and does not use SIMD at all for size 128:
On 8 bytes, Clang trunk:
vpmovzxbw xmm0, qword ptr [rip + bytes]
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpaddw xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 229 # xmm1 = xmm0[1,1,2,3]
vpaddw xmm0, xmm0, xmm1
vpsrld xmm1, xmm0, 16
vpaddw xmm0, xmm0, xmm1
vpextrb eax, xmm0, 0
ret
On 128 bytes, Clang trunk:
mov rcx, -128
xor eax, eax
.LBB0_1:
add al, byte ptr [rcx + bytes+128]
add al, byte ptr [rcx + bytes+129]
add al, byte ptr [rcx + bytes+130]
add al, byte ptr [rcx + bytes+131]
add al, byte ptr [rcx + bytes+132]
add al, byte ptr [rcx + bytes+133]
add al, byte ptr [rcx + bytes+134]
add al, byte ptr [rcx + bytes+135]
add rcx, 8
jne .LBB0_1
ret
Test code:
/* Length of the array being summed. The listings in this report refer to
   sizes 8, 64, 96, 128 and 256; presumably SIZE was changed accordingly
   for each one. */
#define SIZE 128
unsigned char bytes[SIZE];
/* Returns the sum of all elements of `bytes`. The accumulator is an
   unsigned char, so the result is the sum truncated to that type. */
unsigned char sum (void)
{
unsigned char r = 0;
const unsigned char *p = (const unsigned char *) bytes;
int n;
/* n is a signed int compared against the unsigned result of sizeof. */
for (n = 0; n < sizeof (bytes); ++n)
r += p[n];
return r;
}
For reference, here’s GCC (trunk) for 8:
vmovq xmm0, QWORD PTR bytes[rip]
vpxor xmm1, xmm1, xmm1
vpsadbw xmm2, xmm0, xmm1
vmovq rax, xmm2
ret
and GCC (trunk) for 128 (AVX-512BW):
vmovdqu8 zmm2, ZMMWORD PTR bytes[rip]
vpaddb zmm1, zmm2, ZMMWORD PTR bytes[rip+64]
vextracti64x4 ymm0, zmm1, 0x1
vpaddb ymm1, ymm0, ymm1
vextracti128 xmm0, ymm1, 0x1
vpaddb xmm0, xmm0, xmm1
vpsrldq xmm1, xmm0, 8
vpaddb xmm0, xmm0, xmm1
vpxor xmm1, xmm1, xmm1
vpsadbw xmm0, xmm0, xmm1
vpextrb eax, xmm0, 0
vzeroupper
ret
or GCC (trunk) for 128 (non-AVX):
movdqa xmm0, XMMWORD PTR bytes[rip]
paddb xmm0, XMMWORD PTR bytes[rip+16]
paddb xmm0, XMMWORD PTR bytes[rip+48]
paddb xmm0, XMMWORD PTR bytes[rip+32]
paddb xmm0, XMMWORD PTR bytes[rip+80]
paddb xmm0, XMMWORD PTR bytes[rip+64]
paddb xmm0, XMMWORD PTR bytes[rip+96]
paddb xmm0, XMMWORD PTR bytes[rip+112]
movdqa xmm1, xmm0
psrldq xmm1, 8
paddb xmm0, xmm1
pxor xmm1, xmm1
movdqa xmm2, xmm0
psadbw xmm2, xmm1
movaps XMMWORD PTR [rsp-40], xmm2
movzx eax, BYTE PTR [rsp-40]
ret
For sizes 64 and 256, Clang now generates good code. For size 96 its output is
acceptable, but GCC's is better.
Clang (trunk) on 96, with AVX2 and no AVX512BW:
vmovdqu ymm0, ymmword ptr [rip + bytes+32]
vpaddb ymm0, ymm0, ymmword ptr [rip + bytes+64]
vextracti128 xmm1, ymm0, 1
vpaddb xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpaddb xmm0, xmm0, xmm1
vpxor xmm1, xmm1, xmm1
vpsadbw xmm0, xmm0, xmm1
vpextrb ecx, xmm0, 0
vmovdqa xmm0, xmmword ptr [rip + bytes]
vpaddb xmm0, xmm0, xmmword ptr [rip + bytes+16]
vpshufd xmm2, xmm0, 78 # xmm2 = xmm0[2,3,0,1]
vpaddb xmm0, xmm0, xmm2
vpsadbw xmm0, xmm0, xmm1
vpextrb eax, xmm0, 0
add al, cl
vzeroupper
ret
GCC (trunk) on 96, with AVX2 and no AVX512BW:
vmovdqu ymm2, YMMWORD PTR bytes[rip]
vpaddb ymm0, ymm2, YMMWORD PTR bytes[rip+32]
vpaddb ymm1, ymm0, YMMWORD PTR bytes[rip+64]
vextracti128 xmm0, ymm1, 0x1
vpaddb xmm0, xmm0, xmm1
vpsrldq xmm1, xmm0, 8
vpaddb xmm0, xmm0, xmm1
vpxor xmm1, xmm1, xmm1
vpsadbw xmm0, xmm0, xmm1
vpextrb eax, xmm0, 0
vzeroupper
ret
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190801/3e8bca9f/attachment-0001.html>
More information about the llvm-bugs
mailing list