<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Bug 40146 - Inefficient AVX2 code generation with combination of permute and shuffle intrinsics</title>
<!-- Base URL used to resolve any relative links in this document. -->
<base href="https://bugs.llvm.org/">
</head>
<body>
<!--
  Bug metadata: one row per Bugzilla field, with the field name in a row
  header cell and its value in the adjacent data cell.
  NOTE(review): the presentational table attributes are kept as-is because
  this markup appears to be a mail body; confirm before moving them to CSS.
-->
<table border="1" cellspacing="0" cellpadding="8">
  <tr>
    <th scope="row">Bug ID</th>
    <td><a class="bz_bug_link bz_status_NEW"
           href="https://bugs.llvm.org/show_bug.cgi?id=40146"
           title="NEW - Inefficient AVX2 code generation with combination of permute and shuffle intrinsics">40146</a></td>
  </tr>
  <tr>
    <th scope="row">Summary</th>
    <td>Inefficient AVX2 code generation with combination of permute and shuffle intrinsics</td>
  </tr>
  <tr>
    <th scope="row">Product</th>
    <td>clang</td>
  </tr>
  <tr>
    <th scope="row">Version</th>
    <td>unspecified</td>
  </tr>
  <tr>
    <th scope="row">Hardware</th>
    <td>PC</td>
  </tr>
  <tr>
    <th scope="row">OS</th>
    <td>Linux</td>
  </tr>
  <tr>
    <th scope="row">Status</th>
    <td>NEW</td>
  </tr>
  <tr>
    <th scope="row">Severity</th>
    <td>normal</td>
  </tr>
  <tr>
    <th scope="row">Priority</th>
    <td>P</td>
  </tr>
  <tr>
    <th scope="row">Component</th>
    <td>LLVM Codegen</td>
  </tr>
  <tr>
    <th scope="row">Assignee</th>
    <td>unassignedclangbugs@nondot.org</td>
  </tr>
  <tr>
    <th scope="row">Reporter</th>
    <td>svenk.public@gmail.com</td>
  </tr>
  <tr>
    <th scope="row">CC</th>
    <td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk</td>
  </tr>
</table>
<!--
  Bug description exactly as submitted by the reporter, shown as preformatted
  text. The angle brackets of the #include lines are written as character
  references so the header names are rendered instead of being parsed as tags.
-->
<div>
<pre>Clang since version 5.0 produces very inefficient code for the following
function:
#include &lt;x86intrin.h&gt;
#include &lt;stdint.h&gt;
void test(uint32_t *bg)
{
__m256i vbg = _mm256_loadu_si256((__m256i*)bg);
const __m256i K = _mm256_setr_epi64x(
0x8003800280018000,
0x8007800680058004,
0x8003800280018000,
0x8007800680058004);
__m256i lo_bg;
lo_bg = _mm256_permute4x64_epi64(vbg, 0x10);
lo_bg = _mm256_shuffle_epi8(lo_bg, K);
_mm256_storeu_si256((__m256i*)bg, lo_bg);
}
Compile options: -O3 -mavx2
I would expect an almost one-to-one mapping of the intrinsics to the generated
code, nevertheless the resulting assembly looks like this:
test(unsigned int*): # @test(unsigned int*)
push rbp
push rbx
mov r8, rdi
vmovdqu ymm0, ymmword ptr [r8]
vpextrq rcx, xmm0, 1
mov rax, rcx
mov edx, ecx
mov esi, ecx
movzx ebp, ch # NOREX
vpxor xmm1, xmm1, xmm1
vpinsrb xmm2, xmm1, ecx, 0
mov rbx, rcx
shr rcx, 32
movzx edi, ch # NOREX
shr edx, 24
shr esi, 16
vpinsrb xmm2, xmm2, ebp, 2
vpinsrb xmm2, xmm2, esi, 4
vpinsrb xmm2, xmm2, edx, 6
vpinsrb xmm2, xmm2, ecx, 8
vmovq rcx, xmm0
mov rdx, rcx
mov esi, ecx
movzx ebp, ch # NOREX
vpinsrb xmm0, xmm1, ecx, 0
vpinsrb xmm0, xmm0, ebp, 2
mov ebp, ecx
shr ebp, 16
vpinsrb xmm0, xmm0, ebp, 4
mov rbp, rcx
shr rcx, 32
shr esi, 24
vpinsrb xmm0, xmm0, esi, 6
movzx esi, ch # NOREX
vpinsrb xmm0, xmm0, ecx, 8
vpinsrb xmm1, xmm2, edi, 10
vpinsrb xmm0, xmm0, esi, 10
shr rax, 48
vpinsrb xmm1, xmm1, eax, 12
shr rbx, 56
vpinsrb xmm1, xmm1, ebx, 14
shr rdx, 48
vpinsrb xmm0, xmm0, edx, 12
shr rbp, 56
vpinsrb xmm0, xmm0, ebp, 14
vinserti128 ymm0, ymm0, xmm1, 1
vmovdqu ymmword ptr [r8], ymm0
pop rbx
pop rbp
vzeroupper
ret
GCC 8.2 produces a much better output:
test(unsigned int*):
vmovdqu xmm1, XMMWORD PTR [rdi]
vinserti128 ymm0, ymm1, XMMWORD PTR [rdi+16], 0x1
vpermq ymm0, ymm0, 16
vpshufb ymm0, ymm0, YMMWORD PTR .LC0[rip]
vmovups XMMWORD PTR [rdi], xmm0
vextracti128 XMMWORD PTR [rdi+16], ymm0, 0x1
vzeroupper
ret</pre>
</div>
<!-- Mail footer: tells the recipient why this notification was sent to them. -->
<hr>
<span>You are receiving this mail because:</span>
<ul>
  <li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>