[llvm-bugs] [Bug 40146] New: Inefficient AVX2 code generation with combination of permute and shuffle intrinsics
via llvm-bugs
llvm-bugs at lists.llvm.org
Sun Dec 23 06:04:26 PST 2018
https://bugs.llvm.org/show_bug.cgi?id=40146
Bug ID: 40146
Summary: Inefficient AVX2 code generation with combination of
permute and shuffle intrinsics
Product: clang
Version: unspecified
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: LLVM Codegen
Assignee: unassignedclangbugs at nondot.org
Reporter: svenk.public at gmail.com
CC: llvm-bugs at lists.llvm.org, neeilans at live.com,
richard-llvm at metafoo.co.uk
Since version 5.0, Clang produces very inefficient code for the following
function:
#include <x86intrin.h>
#include <stdint.h>

void test(uint32_t *bg)
{
    __m256i vbg = _mm256_loadu_si256((__m256i*)bg);
    const __m256i K = _mm256_setr_epi64x(
        0x8003800280018000,
        0x8007800680058004,
        0x8003800280018000,
        0x8007800680058004);
    __m256i lo_bg;
    lo_bg = _mm256_permute4x64_epi64(vbg, 0x10);
    lo_bg = _mm256_shuffle_epi8(lo_bg, K);
    _mm256_storeu_si256((__m256i*)bg, lo_bg);
}
Compile options: -O3 -mavx2
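For context, my reading of the intrinsic sequence is as follows: the permute
with immediate 0x10 places qword 0 of the input in both halves of the low
128-bit lane and qword 1 in the low half of the high lane, and the shuffle
mask K then zero-extends the low 8 bytes of each lane to 16-bit words (the
0x80 control bytes produce zeros). If that reading is right, the net effect
is that the low 16 bytes of *bg are widened from u8 to u16 and written back
over the full 32 bytes. A plain scalar sketch of that interpretation
(test_scalar is only an illustrative name, not part of the reported code):

    #include <stdint.h>
    #include <string.h>

    /* Scalar sketch, assuming the reading above is correct: zero-extend the
       low 16 bytes of *bg to 16-bit elements and write the 32-byte result
       back over the input. */
    void test_scalar(uint32_t *bg)
    {
        uint8_t  in[16];
        uint16_t out[16];
        memcpy(in, bg, sizeof(in));      /* low 16 bytes of the input        */
        for (int i = 0; i < 16; i++)
            out[i] = in[i];              /* u8 -> u16 zero extension         */
        memcpy(bg, out, sizeof(out));    /* overwrite the full 32-byte block */
    }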
I would expect an almost one-to-one mapping of the intrinsics to the generated
code, but the resulting assembly looks like this instead:
test(unsigned int*): # @test(unsigned int*)
        push rbp
        push rbx
        mov r8, rdi
        vmovdqu ymm0, ymmword ptr [r8]
        vpextrq rcx, xmm0, 1
        mov rax, rcx
        mov edx, ecx
        mov esi, ecx
        movzx ebp, ch # NOREX
        vpxor xmm1, xmm1, xmm1
        vpinsrb xmm2, xmm1, ecx, 0
        mov rbx, rcx
        shr rcx, 32
        movzx edi, ch # NOREX
        shr edx, 24
        shr esi, 16
        vpinsrb xmm2, xmm2, ebp, 2
        vpinsrb xmm2, xmm2, esi, 4
        vpinsrb xmm2, xmm2, edx, 6
        vpinsrb xmm2, xmm2, ecx, 8
        vmovq rcx, xmm0
        mov rdx, rcx
        mov esi, ecx
        movzx ebp, ch # NOREX
        vpinsrb xmm0, xmm1, ecx, 0
        vpinsrb xmm0, xmm0, ebp, 2
        mov ebp, ecx
        shr ebp, 16
        vpinsrb xmm0, xmm0, ebp, 4
        mov rbp, rcx
        shr rcx, 32
        shr esi, 24
        vpinsrb xmm0, xmm0, esi, 6
        movzx esi, ch # NOREX
        vpinsrb xmm0, xmm0, ecx, 8
        vpinsrb xmm1, xmm2, edi, 10
        vpinsrb xmm0, xmm0, esi, 10
        shr rax, 48
        vpinsrb xmm1, xmm1, eax, 12
        shr rbx, 56
        vpinsrb xmm1, xmm1, ebx, 14
        shr rdx, 48
        vpinsrb xmm0, xmm0, edx, 12
        shr rbp, 56
        vpinsrb xmm0, xmm0, ebp, 14
        vinserti128 ymm0, ymm0, xmm1, 1
        vmovdqu ymmword ptr [r8], ymm0
        pop rbx
        pop rbp
        vzeroupper
        ret
GCC 8.2 produces much better output:
test(unsigned int*):
        vmovdqu xmm1, XMMWORD PTR [rdi]
        vinserti128 ymm0, ymm1, XMMWORD PTR [rdi+16], 0x1
        vpermq ymm0, ymm0, 16
        vpshufb ymm0, ymm0, YMMWORD PTR .LC0[rip]
        vmovups XMMWORD PTR [rdi], xmm0
        vextracti128 XMMWORD PTR [rdi+16], ymm0, 0x1
        vzeroupper
        ret
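If the interpretation above is right, the same widening can also be expressed
directly with _mm256_cvtepu8_epi16, which I would expect to lower to a single
vpmovzxbw. A small sketch (test_zext is only an illustrative name, not part of
the reported code):

    #include <x86intrin.h>
    #include <stdint.h>

    /* Sketch of an equivalent formulation, assuming the interpretation
       above: load the low 16 bytes and zero-extend u8 -> u16 directly. */
    void test_zext(uint32_t *bg)
    {
        __m128i lo  = _mm_loadu_si128((__m128i*)bg);  /* low 16 bytes          */
        __m256i ext = _mm256_cvtepu8_epi16(lo);       /* u8 -> u16 zero-extend */
        _mm256_storeu_si256((__m256i*)bg, ext);
    }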