<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - [X86] [AVX2] Suboptimal code for vector select fetching cond from bool[] array"
href="https://bugs.llvm.org/show_bug.cgi?id=37427">37427</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>[X86] [AVX2] Suboptimal code for vector select fetching cond from bool[] array
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>6.0
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Windows NT
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>enhancement
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>fabiang@radgametools.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr></table>
<p>
<div>
<pre>void variable_shift_loop(unsigned int *arr, const bool *control, int count, int
rot0, int rot1)
{
for (int i = 0; i &lt; count; ++i)
{
int rot = control[i] ? rot1 : rot0;
arr[i] = (arr[i] &lt;&lt; rot);
}
}
Clang 6.0 targeting x86-64, "-O2 -mavx2".
The inner loop does:
vpxor %xmm2, %xmm2, %xmm2
.LBB0_4: # =>This Inner Loop Header: Depth=1
vpmovzxbw (%rsi,%rcx), %xmm3 # xmm3 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxbw 8(%rsi,%rcx), %xmm4 # xmm4 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxbw 16(%rsi,%rcx), %xmm5 # xmm5 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpmovzxbw 24(%rsi,%rcx), %xmm6 # xmm6 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
vpcmpeqw %xmm2, %xmm3, %xmm3
vpmovzxwd %xmm3, %ymm3 # ymm3 =
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
vpslld $24, %ymm3, %ymm3
vpcmpeqw %xmm2, %xmm4, %xmm4
vpmovzxwd %xmm4, %ymm4 # ymm4 =
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
vpslld $24, %ymm4, %ymm4
vpcmpeqw %xmm2, %xmm5, %xmm5
vpmovzxwd %xmm5, %ymm5 # ymm5 =
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
vpslld $24, %ymm5, %ymm5
vpcmpeqw %xmm2, %xmm6, %xmm6
vpmovzxwd %xmm6, %ymm6 # ymm6 =
xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
vpslld $24, %ymm6, %ymm6
vblendvps %ymm3, %ymm0, %ymm1, %ymm3
vblendvps %ymm4, %ymm0, %ymm1, %ymm4
vblendvps %ymm5, %ymm0, %ymm1, %ymm5
vblendvps %ymm6, %ymm0, %ymm1, %ymm6
vmovdqu (%rdi,%rcx,4), %ymm7
vpsllvd %ymm3, %ymm7, %ymm3
vmovdqu 32(%rdi,%rcx,4), %ymm7
vpsllvd %ymm4, %ymm7, %ymm4
vmovdqu 64(%rdi,%rcx,4), %ymm7
vpsllvd %ymm5, %ymm7, %ymm5
vmovdqu 96(%rdi,%rcx,4), %ymm7
vpsllvd %ymm6, %ymm7, %ymm6
vmovdqu %ymm3, (%rdi,%rcx,4)
vmovdqu %ymm4, 32(%rdi,%rcx,4)
vmovdqu %ymm5, 64(%rdi,%rcx,4)
vmovdqu %ymm6, 96(%rdi,%rcx,4)
addq $32, %rcx
cmpq %rcx, %rdx
jne .LBB0_4
The intermediate stage of converting to words doesn't seem to be helping, it
just results in extra work.
This should just do 4x vpmovzxbd for the "control" loads, then 4x vpcmpeqd to
set up the blend masks, I think.</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>