<html>

    <head>

      <base href="https://bugs.llvm.org/">

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Wrong code generated with -fslp-vectorize"

   href="https://bugs.llvm.org/show_bug.cgi?id=50338">50338</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Wrong code generated with -fslp-vectorize

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Scalar Optimizations

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>kazu@google.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>I'm seeing a miscompilation triggered by -fslp-vectorize, much like

PR50323.  Unlike PR50323, this one is reproducible with the latest

clang as of c82a0ae70e280c1c40b1af09ef275ddc7ed4254d.

Consider:

#include &lt;immintrin.h&gt;

#include &lt;stdio.h&gt;

struct Float64x8 {

  __m256d ymm_lo;

  __m256d ymm_hi;

};

static inline double ExtractFloat64x8(Float64x8 a, int index) {

  return (index &lt; 4) ? a.ymm_lo[index] : a.ymm_hi[index - 4];

}

// This function appears to be miscompiled.

__attribute__((noinline)) __m256d Permute(Float64x8 a, __m256i map) {

  double result[4];

  for (int i = 0; i &lt; 4; ++i)

    result[i] = ExtractFloat64x8(a, map[i] & 0x7);

  return _mm256_loadu_pd(result);

}

int main() {

  Float64x8 v;

  v.ymm_lo = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);

  v.ymm_hi = _mm256_set_pd(7.0, 6.0, 5.0, 4.0);

  __m256d r = Permute(v, _mm256_set_epi64x(1, 0, 0, 0));

  fprintf(stderr, "%f %f %f %f\n", r[3], r[2], r[1], r[0]);

  return 0;

}

With the latest clang (c82a0ae70e280c1c40b1af09ef275ddc7ed4254d), I see:

$ ./debug/bin/clang++ -O3 -mavx -fno-slp-vectorize permute2.cc ; ./a.out

1.000000 0.000000 0.000000 0.000000

$ ./debug/bin/clang++ -O3 -mavx -fslp-vectorize permute2.cc ; ./a.out

0.000000 2.000000 0.000000 0.000000

Notice that the top two lanes are different.  The former output is

correct.  The latter is not.

Here is the assembly output for Permute with -fslp-vectorize:

        .text

        .file   "permute2.cc"

        .section        .rodata.cst16,"aM",@progbits,16

        .p2align        4                               # -- Begin function _Z7Permute9Float64x8Dv4_x

.LCPI0_0:

        .long   7                               # 0x7

        .long   7                               # 0x7

        .zero   4

        .zero   4

.LCPI0_1:

        .long   4                               # 0x4

        .long   4                               # 0x4

        .zero   4

        .zero   4

        .section        .rodata.cst32,"aM",@progbits,32

        .p2align        5

.LCPI0_2:

        .quad   2                               # 0x2

        .quad   2                               # 0x2

        .quad   2                               # 0x2

        .quad   2                               # 0x2

        .text

        .globl  _Z7Permute9Float64x8Dv4_x

        .p2align        4, 0x90

        .type   _Z7Permute9Float64x8Dv4_x,@function

_Z7Permute9Float64x8Dv4_x:              # @_Z7Permute9Float64x8Dv4_x

        .cfi_startproc

# %bb.0:                                # %entry

        pushq   %rbp

        .cfi_def_cfa_offset 16

        .cfi_offset %rbp, -16

        movq    %rsp, %rbp

        .cfi_def_cfa_register %rbp

        andq    $-32, %rsp

        subq    $96, %rsp

        vpextrd $2, %xmm0, %eax

        movl    %eax, %edx

        andl    $7, %edx

        leal    -4(%rdx), %ecx

        andl    $3, %eax

        vmovaps 16(%rbp), %ymm1

        vmovaps 48(%rbp), %ymm2

        vmovaps %ymm1, 32(%rsp)

        vmovsd  32(%rsp,%rax,8), %xmm1          # xmm1 = mem[0],zero

        vmovd   %xmm0, %eax

        vmovaps %ymm2, (%rsp)

        cmpl    $4, %edx

        jb      .LBB0_2

# %bb.1:                                # %entry

        andl    $3, %ecx

        vmovsd  (%rsp,%rcx,8), %xmm1            # xmm1 = mem[0],zero

.LBB0_2:                                # %entry

        movl    %eax, %ecx

        andl    $7, %ecx

        subl    $4, %ecx

        jb      .LBB0_3

# %bb.4:                                # %entry

        andl    $3, %ecx

        vmovsd  (%rsp,%rcx,8), %xmm2            # xmm2 = mem[0],zero

        jmp     .LBB0_5

.LBB0_3:

        andl    $3, %eax

        vmovsd  32(%rsp,%rax,8), %xmm2          # xmm2 = mem[0],zero

.LBB0_5:                                # %entry

        vmovlhps        %xmm1, %xmm2, %xmm1             # xmm1 = xmm2[0],xmm1[0]

        vextractf128    $1, %ymm0, %xmm0

        vpshufd $216, %xmm0, %xmm2              # xmm2 = xmm0[0,2,1,3]

        vpand   .LCPI0_0(%rip), %xmm2, %xmm2

        vmovd   %xmm0, %eax

        addl    $-4, %eax

        vpmaxud .LCPI0_1(%rip), %xmm2, %xmm3

        andl    $3, %eax

        vpcmpeqd        %xmm3, %xmm2, %xmm3

        vpcmpeqd        %xmm4, %xmm4, %xmm4

        vpxor   %xmm4, %xmm3, %xmm3

        vpextrd $2, %xmm0, %ecx

        addl    $-4, %ecx

        vpaddq  %xmm2, %xmm2, %xmm0

        vmovapd 16(%rbp), %xmm2

        vmovapd 32(%rbp), %xmm4

        vpermilpd       %xmm0, %xmm4, %xmm4

        vpermilpd       %xmm0, %xmm2, %xmm2

        vpcmpgtq        .LCPI0_2(%rip), %xmm0, %xmm0

        vblendvpd       %xmm0, %xmm4, %xmm2, %xmm0

        andl    $3, %ecx

        vmovsd  (%rsp,%rax,8), %xmm2            # xmm2 = mem[0],zero

        vmovhpd (%rsp,%rcx,8), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]

        vpmovsxdq       %xmm3, %xmm3

        vblendvpd       %xmm3, %xmm0, %xmm2, %xmm0

        vinsertf128     $1, %xmm0, %ymm1, %ymm0

        movq    %rbp, %rsp

        popq    %rbp

        .cfi_def_cfa %rsp, 8

        retq</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>