<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Wrong code generated with -fslp-vectorize"

   href="https://bugs.llvm.org/show_bug.cgi?id=50323">50323</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Wrong code generated with -fslp-vectorize

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>kazu@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>craig.topper@gmail.com, llvm-bugs@lists.llvm.org, llvm-dev@redking.me.uk, pengfei.wang@intel.com, spatel+llvm@rotateright.com

          </td>

        </tr></table>

      <p>

        <div>

        <pre>I'm seeing a miscompilation triggered by -fslp-vectorize.

Consider:

#include <stdint.h>

#include <immintrin.h>

#include <stdio.h>

struct Pair {

  __m256i lo;

  __m256i hi;

};

static inline int64_t Extract(const Pair& v, int index) {

  return index < 4 ? v.lo[index] : v.hi[index - 4];

}

// This function gets miscompiled.  This is a lot like

_mm512_permutexvar_epi64.

// It takes a pair of __m256i along with an array of indexes and returns a

// permutation in __m256i.

__attribute__((noinline)) __m256i Permute(Pair a, __m256i map) {

  int64_t result[] = {0, 0, 0, 0};

  _mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);

  result[0] = Extract(a, map[0] & 0x7);

  result[1] = Extract(a, map[1] & 0x7);

  result[2] = Extract(a, map[2] & 0x7);

  result[3] = Extract(a, map[3] & 0x7);

  return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));

}

int main() {

  Pair v;

  v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);

  v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);

  __m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5));

  printf("%02x %02x %02x %02x\n",

         (int)_mm256_extract_epi64(r, 3),

         (int)_mm256_extract_epi64(r, 2),

         (int)_mm256_extract_epi64(r, 1),

         (int)_mm256_extract_epi64(r, 0));

  return 0;

}

With the latest clang (a0ca4c46ca35957a38a6023fa84afda2fc9ba0ec), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out

a2 a3 a4 a5

$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize    permute.cc ; ./a.out

a0 a3 a4 a5

Notice that the top lane is different -- 0xa0 v.s. 0xa2.

Here is the assembly output for Permute:

        .text

        .file   "permute.cc"

        .globl  _Z7Permute4PairDv4_x            # -- Begin function

_Z7Permute4PairDv4_x

        .p2align        4, 0x90

        .type   _Z7Permute4PairDv4_x,@function

_Z7Permute4PairDv4_x:                   # @_Z7Permute4PairDv4_x

        .cfi_startproc

# %bb.0:

        pushq   %rbp

        .cfi_def_cfa_offset 16

        .cfi_offset %rbp, -16

        movq    %rsp, %rbp

        .cfi_def_cfa_register %rbp

        andq    $-32, %rsp

        subq    $96, %rsp

        vextractf128    $1, %ymm0, %xmm1

        vpextrd $2, %xmm1, %eax

        movl    %eax, %edx

        andl    $7, %edx

        vmovaps 16(%rbp), %ymm2

        vmovaps 48(%rbp), %ymm3

        vmovaps %ymm2, (%rsp)

        vmovaps %ymm3, 32(%rsp)

        vmovd   %xmm1, %ecx

        subl    $4, %edx

        jae     .LBB0_1

# %bb.2:

        andl    $3, %eax

        movq    (%rsp,%rax,8), %rax

        jmp     .LBB0_3

.LBB0_1:

        andl    $3, %edx

        movq    32(%rsp,%rdx,8), %rax

.LBB0_3:

        movl    %ecx, %esi

        andl    $7, %esi

        vpextrd $2, %xmm0, %edx

        subl    $4, %esi

        jae     .LBB0_4

# %bb.5:

        andl    $3, %ecx

        movq    (%rsp,%rcx,8), %rcx

        jmp     .LBB0_6

.LBB0_4:

        andl    $3, %esi

        movq    32(%rsp,%rsi,8), %rcx

.LBB0_6:

        movl    %edx, %edi

        andl    $7, %edi

        vmovd   %xmm0, %esi

        subl    $4, %edi

        jae     .LBB0_7

# %bb.8:

        andl    $3, %edx

        movq    (%rsp,%rdx,8), %rdx

        movl    %esi, %edi

        andl    $7, %edi

        subl    $4, %edi

        jb      .LBB0_11

.LBB0_10:

        andl    $3, %edi

        movq    32(%rsp,%rdi,8), %rsi

        jmp     .LBB0_12

.LBB0_7:

        andl    $3, %edi

        movq    32(%rsp,%rdi,8), %rdx

        movl    %esi, %edi

        andl    $7, %edi

        subl    $4, %edi

        jae     .LBB0_10

.LBB0_11:

        andl    $3, %esi

        movq    (%rsp,%rsi,8), %rsi

.LBB0_12:

        vmovq   %rax, %xmm0

        vmovq   %rcx, %xmm1

        vpunpcklqdq     %xmm0, %xmm1, %xmm0     # xmm0 = xmm1[0],xmm0[0]

        vmovq   %rdx, %xmm1

        vmovq   %rsi, %xmm2

        vpunpcklqdq     %xmm1, %xmm2, %xmm1     # xmm1 = xmm2[0],xmm1[0]

        vinsertf128     $1, %xmm0, %ymm1, %ymm0

        movq    %rbp, %rsp

        popq    %rbp

        .cfi_def_cfa %rsp, 8

        retq</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>