[llvm-bugs] [Bug 50168] New: LLVM produces more shuffles than necessary
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Apr 29 03:40:14 PDT 2021
https://bugs.llvm.org/show_bug.cgi?id=50168
Bug ID: 50168
Summary: LLVM produces more shuffles than necessary
Product: libraries
Version: trunk
Hardware: PC
OS: Windows NT
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: david.bolvansky at gmail.com
CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
llvm-dev at redking.me.uk, pengfei.wang at intel.com,
spatel+llvm at rotateright.com
From Reddit:
"Clang successfully vectorizes the whole routine, though it uses more shuffles
than necessary."
#include <stdint.h>
void filter(float *__restrict dst, const float *__restrict src, const float
*__restrict filter) {
float pipeline[16] {};
for(int i=0; i<1024; ++i) {
float x = *src++;
for(int j=0; j<16; ++j)
pipeline[j] += filter[j] * x;
*dst++ = pipeline[0];
for(int j=0; j<15; ++j)
pipeline[j] = pipeline[j+1];
pipeline[15] = 0;
}
}
Trunk -O3
.LCPI0_0:
.long 0x00000000 # float 0
filter(float*, float const*, float const*): # @filter(float*, float const*, float const*)
movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
movss dword ptr [rsp - 4], xmm0 # 4-byte Spill
movups xmm10, xmmword ptr [rdx + 20]
movups xmm11, xmmword ptr [rdx + 4]
movups xmm12, xmmword ptr [rdx + 36]
movss xmm13, dword ptr [rdx + 52] # xmm13 = mem[0],zero,zero,zero
movss xmm14, dword ptr [rdx + 56] # xmm14 = mem[0],zero,zero,zero
movss xmm15, dword ptr [rdx + 60] # xmm15 = mem[0],zero,zero,zero
xorps xmm7, xmm7
xor eax, eax
xorps xmm1, xmm1
xorps xmm0, xmm0
xorps xmm5, xmm5
xorps xmm9, xmm9
xorps xmm3, xmm3
.LBB0_1: # =>This Inner Loop Header: Depth=1
movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero
movaps xmm8, xmm6
mulss xmm8, dword ptr [rsp - 4] # 4-byte Folded Reload
addss xmm8, xmm7
movaps xmm4, xmm6
shufps xmm4, xmm6, 0 # xmm4 = xmm4[0,0],xmm6[0,0]
movaps xmm2, xmm9
shufps xmm2, xmm7, 48 # xmm2 = xmm2[0,0],xmm7[3,0]
shufps xmm7, xmm2, 41 # xmm7 = xmm7[1,2],xmm2[2,0]
movaps xmm2, xmm4
mulps xmm2, xmm11
addps xmm7, xmm2
movaps xmm2, xmm3
shufps xmm2, xmm9, 48 # xmm2 = xmm2[0,0],xmm9[3,0]
shufps xmm9, xmm2, 41 # xmm9 = xmm9[1,2],xmm2[2,0]
movaps xmm2, xmm4
mulps xmm2, xmm10
addps xmm9, xmm2
shufps xmm1, xmm3, 244 # xmm1 = xmm1[0,1],xmm3[3,3]
shufps xmm3, xmm1, 41 # xmm3 = xmm3[1,2],xmm1[2,0]
mulps xmm4, xmm12
addps xmm3, xmm4
movaps xmm1, xmm6
mulss xmm1, xmm13
addss xmm1, xmm0
movaps xmm0, xmm6
mulss xmm0, xmm14
addss xmm0, xmm5
movss dword ptr [rdi + 4*rax], xmm8
mulss xmm6, xmm15
addss xmm6, dword ptr [rip + .LCPI0_0]
add rax, 1
movaps xmm5, xmm6
cmp eax, 1024
jne .LBB0_1
ret
https://gcc.godbolt.org/z/ofz8qde9e
LLVM 12+ started to produce more shuffles.
LLVM 11 -O3
filter(float*, float const*, float const*): # @filter(float*, float const*, float const*)
xorps xmm2, xmm2
movaps xmmword ptr [rsp - 24], xmm2
movaps xmmword ptr [rsp - 40], xmm2
movaps xmmword ptr [rsp - 56], xmm2
movaps xmmword ptr [rsp - 72], xmm2
xorps xmm8, xmm8
xor eax, eax
xorps xmm0, xmm0
xorps xmm3, xmm3
xorps xmm4, xmm4
.LBB0_1: # =>This Inner Loop Header: Depth=1
movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero
movss xmm5, dword ptr [rdx] # xmm5 = mem[0],zero,zero,zero
mulss xmm5, xmm6
addss xmm5, xmm0
movups xmm7, xmmword ptr [rdx + 8]
movaps xmm1, xmm6
shufps xmm1, xmm6, 0 # xmm1 = xmm1[0,0],xmm6[0,0]
mulps xmm7, xmm1
addps xmm7, xmm2
movss xmm0, dword ptr [rdx + 4] # xmm0 = mem[0],zero,zero,zero
mulss xmm0, xmm6
movups xmmword ptr [rsp - 64], xmm7
movups xmm2, xmmword ptr [rdx + 24]
mulps xmm2, xmm1
addps xmm2, xmm3
movups xmmword ptr [rsp - 48], xmm2
movups xmm2, xmmword ptr [rdx + 40]
mulps xmm2, xmm1
addps xmm2, xmm4
movups xmmword ptr [rsp - 32], xmm2
movss xmm1, dword ptr [rdx + 56] # xmm1 = mem[0],zero,zero,zero
mulss xmm1, xmm6
addss xmm1, dword ptr [rsp - 16]
addss xmm0, xmm8
movss dword ptr [rsp - 16], xmm1
mulss xmm6, dword ptr [rdx + 60]
addss xmm6, dword ptr [rsp - 12]
movss dword ptr [rsp - 12], xmm6
movss dword ptr [rdi + 4*rax], xmm5
mov r8d, dword ptr [rsp - 64]
movups xmm2, xmmword ptr [rsp - 60]
movups xmmword ptr [rsp - 64], xmm2
movups xmm3, xmmword ptr [rsp - 44]
movups xmmword ptr [rsp - 48], xmm3
mov ecx, dword ptr [rsp - 28]
mov dword ptr [rsp - 32], ecx
movaps xmm1, xmmword ptr [rsp - 24]
movups xmmword ptr [rsp - 28], xmm1
mov dword ptr [rsp - 12], 0
movd xmm8, r8d
movd xmm4, ecx
shufps xmm4, xmm1, 0 # xmm4 = xmm4[0,0],xmm1[0,0]
shufps xmm4, xmm1, 152 # xmm4 = xmm4[0,2],xmm1[1,2]
add rax, 1
cmp eax, 1024
jne .LBB0_1
ret
https://gcc.godbolt.org/z/4c3EKcxsa
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210429/aa0511bd/attachment.html>
More information about the llvm-bugs mailing list