[llvm-bugs] [Bug 40146] New: Inefficient AVX2 code generation with combination of permute and shuffle intrinsics
via llvm-bugs
llvm-bugs at lists.llvm.org
Sun Dec 23 06:04:26 PST 2018
https://bugs.llvm.org/show_bug.cgi?id=40146
Bug ID: 40146
Summary: Inefficient AVX2 code generation with combination of
permute and shuffle intrinsics
Product: clang
Version: unspecified
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: LLVM Codegen
Assignee: unassignedclangbugs at nondot.org
Reporter: svenk.public at gmail.com
CC: llvm-bugs at lists.llvm.org, neeilans at live.com,
richard-llvm at metafoo.co.uk
Since version 5.0, Clang produces very inefficient code for the following
function:
#include <x86intrin.h>
#include <stdint.h>

void test(uint32_t *bg)
{
    __m256i vbg = _mm256_loadu_si256((__m256i*)bg);
    const __m256i K = _mm256_setr_epi64x(
        0x8003800280018000,
        0x8007800680058004,
        0x8003800280018000,
        0x8007800680058004);
    __m256i lo_bg;
    lo_bg = _mm256_permute4x64_epi64(vbg, 0x10);
    lo_bg = _mm256_shuffle_epi8(lo_bg, K);
    _mm256_storeu_si256((__m256i*)bg, lo_bg);
}
Compile options: -O3 -mavx2
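For context, my reading of the intrinsic sequence is as follows: the permute
with immediate 0x10 places qword 0 of the input in both halves of the low
128-bit lane and qword 1 in the low half of the high lane, and the shuffle
mask K then zero-extends the low 8 bytes of each lane to 16-bit words (the
0x80 control bytes produce zeros). If that reading is right, the net effect
is that the low 16 bytes of *bg are widened from u8 to u16 and written back
over the full 32 bytes. A plain scalar sketch of that interpretation
(test_scalar is only an illustrative name, not part of the reported code):

    #include <stdint.h>
    #include <string.h>

    /* Scalar sketch, assuming the reading above is correct: zero-extend the
       low 16 bytes of *bg to 16-bit elements and write the 32-byte result
       back over the input. */
    void test_scalar(uint32_t *bg)
    {
        uint8_t  in[16];
        uint16_t out[16];
        memcpy(in, bg, sizeof(in));      /* low 16 bytes of the input        */
        for (int i = 0; i < 16; i++)
            out[i] = in[i];              /* u8 -> u16 zero extension         */
        memcpy(bg, out, sizeof(out));    /* overwrite the full 32-byte block */
    }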
I would expect an almost one-to-one mapping of the intrinsics to the generated
code, but the resulting assembly looks like this instead:
test(unsigned int*): # @test(unsigned int*)
        push rbp
        push rbx
        mov r8, rdi
        vmovdqu ymm0, ymmword ptr [r8]
        vpextrq rcx, xmm0, 1
        mov rax, rcx
        mov edx, ecx
        mov esi, ecx
        movzx ebp, ch # NOREX
        vpxor xmm1, xmm1, xmm1
        vpinsrb xmm2, xmm1, ecx, 0
        mov rbx, rcx
        shr rcx, 32
        movzx edi, ch # NOREX
        shr edx, 24
        shr esi, 16
        vpinsrb xmm2, xmm2, ebp, 2
        vpinsrb xmm2, xmm2, esi, 4
        vpinsrb xmm2, xmm2, edx, 6
        vpinsrb xmm2, xmm2, ecx, 8
        vmovq rcx, xmm0
        mov rdx, rcx
        mov esi, ecx
        movzx ebp, ch # NOREX
        vpinsrb xmm0, xmm1, ecx, 0
        vpinsrb xmm0, xmm0, ebp, 2
        mov ebp, ecx
        shr ebp, 16
        vpinsrb xmm0, xmm0, ebp, 4
        mov rbp, rcx
        shr rcx, 32
        shr esi, 24
        vpinsrb xmm0, xmm0, esi, 6
        movzx esi, ch # NOREX
        vpinsrb xmm0, xmm0, ecx, 8
        vpinsrb xmm1, xmm2, edi, 10
        vpinsrb xmm0, xmm0, esi, 10
        shr rax, 48
        vpinsrb xmm1, xmm1, eax, 12
        shr rbx, 56
        vpinsrb xmm1, xmm1, ebx, 14
        shr rdx, 48
        vpinsrb xmm0, xmm0, edx, 12
        shr rbp, 56
        vpinsrb xmm0, xmm0, ebp, 14
        vinserti128 ymm0, ymm0, xmm1, 1
        vmovdqu ymmword ptr [r8], ymm0
        pop rbx
        pop rbp
        vzeroupper
        ret
GCC 8.2 produces much better output:
test(unsigned int*):
        vmovdqu xmm1, XMMWORD PTR [rdi]
        vinserti128 ymm0, ymm1, XMMWORD PTR [rdi+16], 0x1
        vpermq ymm0, ymm0, 16
        vpshufb ymm0, ymm0, YMMWORD PTR .LC0[rip]
        vmovups XMMWORD PTR [rdi], xmm0
        vextracti128 XMMWORD PTR [rdi+16], ymm0, 0x1
        vzeroupper
        ret
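If the interpretation above is right, the same widening can also be expressed
directly with _mm256_cvtepu8_epi16, which I would expect to lower to a single
vpmovzxbw. A small sketch (test_zext is only an illustrative name, not part of
the reported code):

    #include <x86intrin.h>
    #include <stdint.h>

    /* Sketch of an equivalent formulation, assuming the interpretation
       above: load the low 16 bytes and zero-extend u8 -> u16 directly. */
    void test_zext(uint32_t *bg)
    {
        __m128i lo  = _mm_loadu_si128((__m128i*)bg);  /* low 16 bytes          */
        __m256i ext = _mm256_cvtepu8_epi16(lo);       /* u8 -> u16 zero-extend */
        _mm256_storeu_si256((__m256i*)bg, ext);
    }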