[llvm-bugs] [Bug 50338] New: Wrong code generated with -fslp-vectorize

Thu May 13 23:48:17 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=50338

            Bug ID: 50338
           Summary: Wrong code generated with -fslp-vectorize
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: kazu at google.com
                CC: llvm-bugs at lists.llvm.org

I'm seeing a miscompilation triggered by -fslp-vectorize, much like
PR50323.  Unlike PR50323, this one is reproducible with the latest
clang as of commit c82a0ae70e280c1c40b1af09ef275ddc7ed4254d.

Consider:

#include <immintrin.h>
#include <stdio.h>

// Eight doubles stored as two 256-bit AVX vectors of four lanes each.
// Logical elements 0..3 live in ymm_lo and 4..7 in ymm_hi (see
// ExtractFloat64x8 for the index mapping).
struct Float64x8 {
  __m256d ymm_lo;  // logical elements 0..3
  __m256d ymm_hi;  // logical elements 4..7
};

// Returns logical element `index` of `a`: ymm_lo[index] when index < 4,
// otherwise ymm_hi[index - 4].  No range check is performed here; the only
// caller in this reproducer passes an index already masked with 0x7.
static inline double ExtractFloat64x8(Float64x8 a, int index) {
  return (index < 4) ? a.ymm_lo[index] : a.ymm_hi[index - 4];
}

// This function appears to be miscompiled.
//
// For each of the four lanes i, uses the low three bits of map[i] as a
// logical index (0..7) into `a` and stores the selected double in result[i].
// The four results are then returned as a single vector.
// noinline keeps the function as a separate symbol so its generated code can
// be inspected on its own.
__attribute__((noinline)) __m256d Permute(Float64x8 a, __m256i map) {
  double result[4];
  for (int i = 0; i < 4; ++i)
    result[i] = ExtractFloat64x8(a, map[i] & 0x7);  // & 0x7 keeps the index in 0..7
  return _mm256_loadu_pd(result);  // unaligned load: result[] is a plain double array
}

// Driver for the reproducer: permutes a vector whose logical element k holds
// the value k, then prints the four result lanes to stderr from lane 3 down
// to lane 0.
int main() {
  Float64x8 v;
  // _mm256_set_pd takes its arguments highest lane first, so lane 0 of
  // ymm_lo is 0.0 and lane 3 is 3.0; likewise ymm_hi holds 4.0..7.0.
  v.ymm_lo = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
  v.ymm_hi = _mm256_set_pd(7.0, 6.0, 5.0, 4.0);
  // Map (also highest lane first): lane 3 selects element 1, lanes 2..0
  // select element 0.
  __m256d r = Permute(v, _mm256_set_epi64x(1, 0, 0, 0));
  // Printed in the order r[3] r[2] r[1] r[0].
  fprintf(stderr, "%f %f %f %f\n", r[3], r[2], r[1], r[0]);
  return 0;
}

With the latest clang (c82a0ae70e280c1c40b1af09ef275ddc7ed4254d), I see:

$ ./debug/bin/clang++ -O3 -mavx -fno-slp-vectorize permute2.cc ; ./a.out
1.000000 0.000000 0.000000 0.000000
$ ./debug/bin/clang++ -O3 -mavx -fslp-vectorize permute2.cc ; ./a.out
0.000000 2.000000 0.000000 0.000000

Notice that the top two lanes are different.  The former output is
correct.  The latter is not.

Here is the assembly output for Permute with -fslp-vectorize:

        .text
        .file   "permute2.cc"
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4                               # -- Begin function _Z7Permute9Float64x8Dv4_x
.LCPI0_0:
        .long   7                               # 0x7
        .long   7                               # 0x7
        .zero   4
        .zero   4
.LCPI0_1:
        .long   4                               # 0x4
        .long   4                               # 0x4
        .zero   4
        .zero   4
        .section        .rodata.cst32,"aM",@progbits,32
        .p2align        5
.LCPI0_2:
        .quad   2                               # 0x2
        .quad   2                               # 0x2
        .quad   2                               # 0x2
        .quad   2                               # 0x2
        .text
        .globl  _Z7Permute9Float64x8Dv4_x
        .p2align        4, 0x90
        .type   _Z7Permute9Float64x8Dv4_x,@function
_Z7Permute9Float64x8Dv4_x:              # @_Z7Permute9Float64x8Dv4_x
        .cfi_startproc
# %bb.0:                                # %entry
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        andq    $-32, %rsp
        subq    $96, %rsp
        vpextrd $2, %xmm0, %eax
        movl    %eax, %edx
        andl    $7, %edx
        leal    -4(%rdx), %ecx
        andl    $3, %eax
        vmovaps 16(%rbp), %ymm1
        vmovaps 48(%rbp), %ymm2
        vmovaps %ymm1, 32(%rsp)
        vmovsd  32(%rsp,%rax,8), %xmm1          # xmm1 = mem[0],zero
        vmovd   %xmm0, %eax
        vmovaps %ymm2, (%rsp)
        cmpl    $4, %edx
        jb      .LBB0_2
# %bb.1:                                # %entry
        andl    $3, %ecx
        vmovsd  (%rsp,%rcx,8), %xmm1            # xmm1 = mem[0],zero
.LBB0_2:                                # %entry
        movl    %eax, %ecx
        andl    $7, %ecx
        subl    $4, %ecx
        jb      .LBB0_3
# %bb.4:                                # %entry
        andl    $3, %ecx
        vmovsd  (%rsp,%rcx,8), %xmm2            # xmm2 = mem[0],zero
        jmp     .LBB0_5
.LBB0_3:
        andl    $3, %eax
        vmovsd  32(%rsp,%rax,8), %xmm2          # xmm2 = mem[0],zero
.LBB0_5:                                # %entry
        vmovlhps        %xmm1, %xmm2, %xmm1             # xmm1 = xmm2[0],xmm1[0]
        vextractf128    $1, %ymm0, %xmm0
        vpshufd $216, %xmm0, %xmm2              # xmm2 = xmm0[0,2,1,3]
        vpand   .LCPI0_0(%rip), %xmm2, %xmm2
        vmovd   %xmm0, %eax
        addl    $-4, %eax
        vpmaxud .LCPI0_1(%rip), %xmm2, %xmm3
        andl    $3, %eax
        vpcmpeqd        %xmm3, %xmm2, %xmm3
        vpcmpeqd        %xmm4, %xmm4, %xmm4
        vpxor   %xmm4, %xmm3, %xmm3
        vpextrd $2, %xmm0, %ecx
        addl    $-4, %ecx
        vpaddq  %xmm2, %xmm2, %xmm0
        vmovapd 16(%rbp), %xmm2
        vmovapd 32(%rbp), %xmm4
        vpermilpd       %xmm0, %xmm4, %xmm4
        vpermilpd       %xmm0, %xmm2, %xmm2
        vpcmpgtq        .LCPI0_2(%rip), %xmm0, %xmm0
        vblendvpd       %xmm0, %xmm4, %xmm2, %xmm0
        andl    $3, %ecx
        vmovsd  (%rsp,%rax,8), %xmm2            # xmm2 = mem[0],zero
        vmovhpd (%rsp,%rcx,8), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
        vpmovsxdq       %xmm3, %xmm3
        vblendvpd       %xmm3, %xmm0, %xmm2, %xmm0
        vinsertf128     $1, %xmm0, %ymm1, %ymm0
        movq    %rbp, %rsp
        popq    %rbp
        .cfi_def_cfa %rsp, 8
        retq

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210514/4c1e3aaf/attachment-0001.html>