<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Wrong code generated with -fslp-vectorize"

   href="https://bugs.llvm.org/show_bug.cgi?id=50356">50356</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Wrong code generated with -fslp-vectorize

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>kazu@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>I'm seeing a miscompilation triggered by -fslp-vectorize.  This is

similar and may be related to PR50323 and PR50338.

Consider:

#include &lt;immintrin.h&gt;

#include &lt;stdio.h&gt;

#include &lt;stdint.h&gt;

#include &lt;inttypes.h&gt;

struct Int64x8 {

  __m256i ymm_lo;

  __m256i ymm_hi;

};

static inline int64_t Extract(Int64x8 a, int index) {

  return (index &lt; 4) ? a.ymm_lo[index] : a.ymm_hi[index - 4];

}

// This function appears to be miscompiled with -fslp-vectorize.

__attribute__((noinline)) __m256i Permute(Int64x8 a, __m256i b, __m256i c) {

  int64_t d[4];

  d[0] = Extract(a, b[0] & 0x7);

  d[1] = Extract(a, b[1] & 0x7);

  d[2] = Extract(a, b[2] & 0x7);

  d[3] = Extract(a, b[3] & 0x7);

  __m256i e = _mm256_loadu_si256(reinterpret_cast&lt;const __m256i *&gt;(d));

  int64_t f[4];

  f[0] = e[0] &lt; c[0] ? -1 : 0;

  f[1] = e[1] &lt; c[1] ? -1 : 0;

  f[2] = e[2] &lt; c[2] ? -1 : 0;

  f[3] = e[3] &lt; c[3] ? -1 : 0;

  return _mm256_loadu_si256(reinterpret_cast&lt;const __m256i *&gt;(f));

}

int main() {

  Int64x8 a;

  a.ymm_lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);

  a.ymm_hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);

  __m256i m = Permute(a,

                      _mm256_set_epi64x(0, 0, 1, 0),

                      _mm256_set_epi64x(0, 0, 0, 0xa1));

  printf("%016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",

         (int64_t)m[3], (int64_t)m[2], (int64_t)m[1], (int64_t)m[0]);

  return 0;

}

With the latest clang (4b91f96a3e291db1ea6360c9a842ecbc6ee89d67), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute3.cc ; ./a.out

0000000000000000 0000000000000000 0000000000000000 ffffffffffffffff

$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize    permute3.cc ; ./a.out

0000000000000000 0000000000000000 0000000000000000 0000000000000000

Notice that the bottom lane is different in the output.

Here is the assembly output for Permute:

        .text

        .file   "permute3.cc"

        .section        .rodata.cst16,"aM",@progbits,16

        .p2align        4                               # -- Begin function

_Z7Permute7Int64x8Dv4_xS0_

.LCPI0_0:

        .long   7                               # 0x7

        .long   7                               # 0x7

        .long   7                               # 0x7

        .long   7                               # 0x7

.LCPI0_1:

        .long   4                               # 0x4

        .long   4                               # 0x4

        .long   4                               # 0x4

        .long   4                               # 0x4

        .section        .rodata.cst32,"aM",@progbits,32

        .p2align        5

.LCPI0_2:

        .quad   2                               # 0x2

        .quad   2                               # 0x2

        .quad   2                               # 0x2

        .quad   2                               # 0x2

        .text

        .globl  _Z7Permute7Int64x8Dv4_xS0_

        .p2align        4, 0x90

        .type   _Z7Permute7Int64x8Dv4_xS0_,@function

_Z7Permute7Int64x8Dv4_xS0_:             # @_Z7Permute7Int64x8Dv4_xS0_

        .cfi_startproc

# %bb.0:

        pushq   %rbp

        .cfi_def_cfa_offset 16

        .cfi_offset %rbp, -16

        movq    %rsp, %rbp

        .cfi_def_cfa_register %rbp

        andq    $-32, %rsp

        subq    $96, %rsp

        vmovaps 16(%rbp), %ymm3

        vmovaps 48(%rbp), %ymm4

        vextractf128    $1, %ymm0, %xmm2

        vshufps $136, %xmm2, %xmm0, %xmm2       # xmm2 = xmm0[0,2],xmm2[0,2]

        vandps  .LCPI0_0(%rip), %xmm2, %xmm2

        vmovd   %xmm0, %eax

        addl    $-4, %eax

        vmovaps %ymm4, (%rsp)

        andl    $3, %eax

        vpextrd $2, %xmm0, %edx

        addl    $-4, %edx

        andl    $3, %edx

        vextractps      $2, %xmm2, %ecx

        vmovaps %ymm3, 32(%rsp)

        movl    %ecx, %r8d

        andl    $3, %r8d

        addl    $-4, %ecx

        andl    $3, %ecx

        vmovdqa .LCPI0_1(%rip), %xmm0           # xmm0 = [4,4,4,4]

        vpcmpgtd        %xmm2, %xmm0, %xmm3

        vpshufd $238, %xmm3, %xmm0              # xmm0 = xmm3[2,3,2,3]

        vpmovsxdq       %xmm0, %xmm0

        vpmovsxdq       %xmm3, %xmm3

        vpextrd $3, %xmm2, %edi

        movl    %edi, %esi

        andl    $3, %esi

        addl    $-4, %edi

        vpaddq  %xmm2, %xmm2, %xmm2

        vmovapd 16(%rbp), %xmm4

        vmovapd 32(%rbp), %xmm5

        vpermilpd       %xmm2, %xmm5, %xmm5

        vpermilpd       %xmm2, %xmm4, %xmm4

        vpcmpgtq        .LCPI0_2(%rip), %xmm2, %xmm2

        andl    $3, %edi

        vblendvpd       %xmm2, %xmm5, %xmm4, %xmm2

        vmovsd  (%rsp,%rdx,8), %xmm4            # xmm4 = mem[0],zero

        vmovsd  (%rsp,%rax,8), %xmm5            # xmm5 = mem[0],zero

        vunpcklpd       %xmm4, %xmm5, %xmm4     # xmm4 = xmm5[0],xmm4[0]

        vblendvpd       %xmm3, %xmm2, %xmm4, %xmm2

        vmovsd  32(%rsp,%rsi,8), %xmm3          # xmm3 = mem[0],zero

        vmovsd  32(%rsp,%r8,8), %xmm4           # xmm4 = mem[0],zero

        vunpcklpd       %xmm3, %xmm4, %xmm3     # xmm3 = xmm4[0],xmm3[0]

        vmovsd  (%rsp,%rdi,8), %xmm4            # xmm4 = mem[0],zero

        vmovsd  (%rsp,%rcx,8), %xmm5            # xmm5 = mem[0],zero

        vunpcklpd       %xmm4, %xmm5, %xmm4     # xmm4 = xmm5[0],xmm4[0]

        vblendvpd       %xmm0, %xmm3, %xmm4, %xmm0

        vpcmpgtq        %xmm2, %xmm1, %xmm2

        vextractf128    $1, %ymm1, %xmm1

        vpcmpgtq        %xmm0, %xmm1, %xmm0

        vinsertf128     $1, %xmm0, %ymm2, %ymm0

        movq    %rbp, %rsp

        popq    %rbp

        .cfi_def_cfa %rsp, 8

        retq</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>