<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - LLVM produces more shuffles than necessary"
href="https://bugs.llvm.org/show_bug.cgi?id=50168">50168</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>LLVM produces more shuffles than necessary
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows NT
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>david.bolvansky@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, pengfei.wang@intel.com, spatel+llvm@rotateright.com
</td>
</tr></table>
<p>
<div>
<pre>From Reddit:
"Clang successfully vectorizes the whole routine, though it uses more shuffles
than necessary. "
#include &lt;stdint.h&gt;
void filter(float *__restrict dst, const float *__restrict src, const float
*__restrict filter) {
float pipeline[16] {};
for(int i=0; i&lt;1024; ++i) {
float x = *src++;
for(int j=0; j&lt;16; ++j)
pipeline[j] += filter[j] * x;
*dst++ = pipeline[0];
for(int j=0; j&lt;15; ++j)
pipeline[j] = pipeline[j+1];
pipeline[15] = 0;
}
}
Trunk -O3
.LCPI0_0:
.long 0x00000000 # float 0
filter(float*, float const*, float const*): # @filter(float*, float const*, float const*)
movss xmm0, dword ptr [rdx] # xmm0 = mem[0],zero,zero,zero
movss dword ptr [rsp - 4], xmm0 # 4-byte Spill
movups xmm10, xmmword ptr [rdx + 20]
movups xmm11, xmmword ptr [rdx + 4]
movups xmm12, xmmword ptr [rdx + 36]
movss xmm13, dword ptr [rdx + 52] # xmm13 = mem[0],zero,zero,zero
movss xmm14, dword ptr [rdx + 56] # xmm14 = mem[0],zero,zero,zero
movss xmm15, dword ptr [rdx + 60] # xmm15 = mem[0],zero,zero,zero
xorps xmm7, xmm7
xor eax, eax
xorps xmm1, xmm1
xorps xmm0, xmm0
xorps xmm5, xmm5
xorps xmm9, xmm9
xorps xmm3, xmm3
.LBB0_1: # =>This Inner Loop Header: Depth=1
movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero
movaps xmm8, xmm6
mulss xmm8, dword ptr [rsp - 4] # 4-byte Folded Reload
addss xmm8, xmm7
movaps xmm4, xmm6
shufps xmm4, xmm6, 0 # xmm4 = xmm4[0,0],xmm6[0,0]
movaps xmm2, xmm9
shufps xmm2, xmm7, 48 # xmm2 = xmm2[0,0],xmm7[3,0]
shufps xmm7, xmm2, 41 # xmm7 = xmm7[1,2],xmm2[2,0]
movaps xmm2, xmm4
mulps xmm2, xmm11
addps xmm7, xmm2
movaps xmm2, xmm3
shufps xmm2, xmm9, 48 # xmm2 = xmm2[0,0],xmm9[3,0]
shufps xmm9, xmm2, 41 # xmm9 = xmm9[1,2],xmm2[2,0]
movaps xmm2, xmm4
mulps xmm2, xmm10
addps xmm9, xmm2
shufps xmm1, xmm3, 244 # xmm1 = xmm1[0,1],xmm3[3,3]
shufps xmm3, xmm1, 41 # xmm3 = xmm3[1,2],xmm1[2,0]
mulps xmm4, xmm12
addps xmm3, xmm4
movaps xmm1, xmm6
mulss xmm1, xmm13
addss xmm1, xmm0
movaps xmm0, xmm6
mulss xmm0, xmm14
addss xmm0, xmm5
movss dword ptr [rdi + 4*rax], xmm8
mulss xmm6, xmm15
addss xmm6, dword ptr [rip + .LCPI0_0]
add rax, 1
movaps xmm5, xmm6
cmp eax, 1024
jne .LBB0_1
ret
<a href="https://gcc.godbolt.org/z/ofz8qde9e">https://gcc.godbolt.org/z/ofz8qde9e</a>
LLVM 12+ started to produce more shuffles.
LLVM 11 -O3
filter(float*, float const*, float const*): # @filter(float*, float const*, float const*)
xorps xmm2, xmm2
movaps xmmword ptr [rsp - 24], xmm2
movaps xmmword ptr [rsp - 40], xmm2
movaps xmmword ptr [rsp - 56], xmm2
movaps xmmword ptr [rsp - 72], xmm2
xorps xmm8, xmm8
xor eax, eax
xorps xmm0, xmm0
xorps xmm3, xmm3
xorps xmm4, xmm4
.LBB0_1: # =>This Inner Loop Header: Depth=1
movss xmm6, dword ptr [rsi + 4*rax] # xmm6 = mem[0],zero,zero,zero
movss xmm5, dword ptr [rdx] # xmm5 = mem[0],zero,zero,zero
mulss xmm5, xmm6
addss xmm5, xmm0
movups xmm7, xmmword ptr [rdx + 8]
movaps xmm1, xmm6
shufps xmm1, xmm6, 0 # xmm1 = xmm1[0,0],xmm6[0,0]
mulps xmm7, xmm1
addps xmm7, xmm2
movss xmm0, dword ptr [rdx + 4] # xmm0 = mem[0],zero,zero,zero
mulss xmm0, xmm6
movups xmmword ptr [rsp - 64], xmm7
movups xmm2, xmmword ptr [rdx + 24]
mulps xmm2, xmm1
addps xmm2, xmm3
movups xmmword ptr [rsp - 48], xmm2
movups xmm2, xmmword ptr [rdx + 40]
mulps xmm2, xmm1
addps xmm2, xmm4
movups xmmword ptr [rsp - 32], xmm2
movss xmm1, dword ptr [rdx + 56] # xmm1 = mem[0],zero,zero,zero
mulss xmm1, xmm6
addss xmm1, dword ptr [rsp - 16]
addss xmm0, xmm8
movss dword ptr [rsp - 16], xmm1
mulss xmm6, dword ptr [rdx + 60]
addss xmm6, dword ptr [rsp - 12]
movss dword ptr [rsp - 12], xmm6
movss dword ptr [rdi + 4*rax], xmm5
mov r8d, dword ptr [rsp - 64]
movups xmm2, xmmword ptr [rsp - 60]
movups xmmword ptr [rsp - 64], xmm2
movups xmm3, xmmword ptr [rsp - 44]
movups xmmword ptr [rsp - 48], xmm3
mov ecx, dword ptr [rsp - 28]
mov dword ptr [rsp - 32], ecx
movaps xmm1, xmmword ptr [rsp - 24]
movups xmmword ptr [rsp - 28], xmm1
mov dword ptr [rsp - 12], 0
movd xmm8, r8d
movd xmm4, ecx
shufps xmm4, xmm1, 0 # xmm4 = xmm4[0,0],xmm1[0,0]
shufps xmm4, xmm1, 152 # xmm4 = xmm4[0,2],xmm1[1,2]
add rax, 1
cmp eax, 1024
jne .LBB0_1
ret
<a href="https://gcc.godbolt.org/z/4c3EKcxsa">https://gcc.godbolt.org/z/4c3EKcxsa</a></pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>