<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Inefficient AVX2 code generation with combination of permute and shuffle intrinsics"

   href="https://bugs.llvm.org/show_bug.cgi?id=40146">40146</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Inefficient AVX2 code generation with combination of permute and shuffle intrinsics

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>clang

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>unspecified

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>LLVM Codegen

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedclangbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>svenk.public@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk

          </td>

        </tr></table>

        <div>

        <pre>Clang since version 5.0 produces very inefficient code for the following

function:

#include &lt;x86intrin.h&gt;

#include &lt;stdint.h&gt;

void test(uint32_t *bg)

{

    __m256i vbg = _mm256_loadu_si256((__m256i*)bg);

    const __m256i K = _mm256_setr_epi64x(

        0x8003800280018000,

        0x8007800680058004,

        0x8003800280018000,

        0x8007800680058004);

    __m256i lo_bg;

    lo_bg = _mm256_permute4x64_epi64(vbg, 0x10);

    lo_bg = _mm256_shuffle_epi8(lo_bg, K);

    _mm256_storeu_si256((__m256i*)bg, lo_bg);

}

Compile options: -O3 -mavx2

I would expect an almost one-to-one mapping of the intrinsics to the generated

code, nevertheless the resulting assembly looks like this:

test(unsigned int*): # @test(unsigned int*)

  push rbp

  push rbx

  mov r8, rdi

  vmovdqu ymm0, ymmword ptr [r8]

  vpextrq rcx, xmm0, 1

  mov rax, rcx

  mov edx, ecx

  mov esi, ecx

  movzx ebp, ch # NOREX

  vpxor xmm1, xmm1, xmm1

  vpinsrb xmm2, xmm1, ecx, 0

  mov rbx, rcx

  shr rcx, 32

  movzx edi, ch # NOREX

  shr edx, 24

  shr esi, 16

  vpinsrb xmm2, xmm2, ebp, 2

  vpinsrb xmm2, xmm2, esi, 4

  vpinsrb xmm2, xmm2, edx, 6

  vpinsrb xmm2, xmm2, ecx, 8

  vmovq rcx, xmm0

  mov rdx, rcx

  mov esi, ecx

  movzx ebp, ch # NOREX

  vpinsrb xmm0, xmm1, ecx, 0

  vpinsrb xmm0, xmm0, ebp, 2

  mov ebp, ecx

  shr ebp, 16

  vpinsrb xmm0, xmm0, ebp, 4

  mov rbp, rcx

  shr rcx, 32

  shr esi, 24

  vpinsrb xmm0, xmm0, esi, 6

  movzx esi, ch # NOREX

  vpinsrb xmm0, xmm0, ecx, 8

  vpinsrb xmm1, xmm2, edi, 10

  vpinsrb xmm0, xmm0, esi, 10

  shr rax, 48

  vpinsrb xmm1, xmm1, eax, 12

  shr rbx, 56

  vpinsrb xmm1, xmm1, ebx, 14

  shr rdx, 48

  vpinsrb xmm0, xmm0, edx, 12

  shr rbp, 56

  vpinsrb xmm0, xmm0, ebp, 14

  vinserti128 ymm0, ymm0, xmm1, 1

  vmovdqu ymmword ptr [r8], ymm0

  pop rbx

  pop rbp

  vzeroupper

  ret

GCC 8.2 produces a much better output:

test(unsigned int*):

  vmovdqu xmm1, XMMWORD PTR [rdi]

  vinserti128 ymm0, ymm1, XMMWORD PTR [rdi+16], 0x1

  vpermq ymm0, ymm0, 16

  vpshufb ymm0, ymm0, YMMWORD PTR .LC0[rip]

  vmovups XMMWORD PTR [rdi], xmm0

  vextracti128 XMMWORD PTR [rdi+16], ymm0, 0x1

  vzeroupper

  ret</pre>

        </div>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>