[llvm-bugs] [Bug 50356] New: Wrong code generated with -fslp-vectorize

Sat May 15 21:34:20 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=50356

            Bug ID: 50356
           Summary: Wrong code generated with -fslp-vectorize
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: kazu at google.com
                CC: llvm-bugs at lists.llvm.org

I'm seeing a miscompilation triggered by -fslp-vectorize.  This is
similar to, and may be related to, PR50323 and PR50338.

Consider:

#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

// Eight 64-bit integer lanes stored as two 256-bit AVX vectors.
// Extract() reads lanes 0-3 from ymm_lo and lanes 4-7 from ymm_hi.
struct Int64x8 {
  __m256i ymm_lo;  // lanes 0..3
  __m256i ymm_hi;  // lanes 4..7
};

// Returns 64-bit lane `index` of `a`: indices below 4 are read from
// a.ymm_lo, every other index is read from a.ymm_hi at position index - 4.
// The lanes are accessed with the compiler's vector subscript extension.
// `index` is not range-checked here; the callers in this test case mask it
// with 0x7 before the call.
static inline int64_t Extract(Int64x8 a, int index) {
  return (index < 4) ? a.ymm_lo[index] : a.ymm_hi[index - 4];
}

// This function appears to be miscompiled with -fslp-vectorize.
//
// For each lane i in 0..3 it:
//   1. selects lane (b[i] & 0x7) of the 8-lane value `a` via Extract(), and
//   2. compares the selected value (signed 64-bit) against c[i], producing
//      -1 (all bits set) when it is smaller and 0 otherwise.
// Returns the four comparison results packed into a __m256i.
//
// noinline keeps Permute as a real out-of-line call. Presumably this is to
// stop the optimizer from folding the constant arguments supplied by main();
// the report does not state the reason.
__attribute__((noinline)) __m256i Permute(Int64x8 a, __m256i b, __m256i c) {
  // Gather step: d[i] = lane (b[i] & 7) of a.
  int64_t d[4];
  d[0] = Extract(a, b[0] & 0x7);
  d[1] = Extract(a, b[1] & 0x7);
  d[2] = Extract(a, b[2] & 0x7);
  d[3] = Extract(a, b[3] & 0x7);
  // Reload the scalar array as a vector (unaligned load).
  __m256i e = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(d));
  // Compare step: f[i] is an all-ones mask when e[i] < c[i], else zero.
  int64_t f[4];
  f[0] = e[0] < c[0] ? -1 : 0;
  f[1] = e[1] < c[1] ? -1 : 0;
  f[2] = e[2] < c[2] ? -1 : 0;
  f[3] = e[3] < c[3] ? -1 : 0;
  return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(f));
}

// Test driver: calls Permute() with fixed inputs and prints the four result
// lanes as 16-digit hex values, highest lane (m[3]) first and lane 0 last.
int main() {
  Int64x8 a;
  // _mm256_set_epi64x lists its arguments from the highest lane down to the
  // lowest, so lanes 0..7 of `a` hold 0xa0..0xa7.
  a.ymm_lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
  a.ymm_hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);
  // b (lane 0 first) = {0, 1, 0, 0}: the selected values are
  // {0xa0, 0xa1, 0xa0, 0xa0}. c (lane 0 first) = {0xa1, 0, 0, 0}.
  // Only lane 0 satisfies selected < c (0xa0 < 0xa1), so the expected result
  // is lane 0 = all ones and lanes 1..3 = 0.
  __m256i m = Permute(a,
                      _mm256_set_epi64x(0, 0, 1, 0),
                      _mm256_set_epi64x(0, 0, 0, 0xa1));
  printf("%016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n",
         (int64_t)m[3], (int64_t)m[2], (int64_t)m[1], (int64_t)m[0]);
  return 0;
}

With the latest clang (4b91f96a3e291db1ea6360c9a842ecbc6ee89d67), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute3.cc ; ./a.out
0000000000000000 0000000000000000 0000000000000000 ffffffffffffffff
$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize    permute3.cc ; ./a.out
0000000000000000 0000000000000000 0000000000000000 0000000000000000

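Notice that the bottom lane (m[0], the last value printed) differs between
the two outputs.  The -fno-slp-vectorize result is the correct one: lane 0
selects a[0] = 0xa0, which is less than c[0] = 0xa1, so the lane must be
all ones.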

Here is the assembly output for Permute:

        .text
        .file   "permute3.cc"
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4                               # -- Begin function _Z7Permute7Int64x8Dv4_xS0_
.LCPI0_0:
        .long   7                               # 0x7
        .long   7                               # 0x7
        .long   7                               # 0x7
        .long   7                               # 0x7
.LCPI0_1:
        .long   4                               # 0x4
        .long   4                               # 0x4
        .long   4                               # 0x4
        .long   4                               # 0x4
        .section        .rodata.cst32,"aM",@progbits,32
        .p2align        5
.LCPI0_2:
        .quad   2                               # 0x2
        .quad   2                               # 0x2
        .quad   2                               # 0x2
        .quad   2                               # 0x2
        .text
        .globl  _Z7Permute7Int64x8Dv4_xS0_
        .p2align        4, 0x90
        .type   _Z7Permute7Int64x8Dv4_xS0_,@function
_Z7Permute7Int64x8Dv4_xS0_:             # @_Z7Permute7Int64x8Dv4_xS0_
        .cfi_startproc
# %bb.0:
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        andq    $-32, %rsp
        subq    $96, %rsp
        vmovaps 16(%rbp), %ymm3
        vmovaps 48(%rbp), %ymm4
        vextractf128    $1, %ymm0, %xmm2
        vshufps $136, %xmm2, %xmm0, %xmm2       # xmm2 = xmm0[0,2],xmm2[0,2]
        vandps  .LCPI0_0(%rip), %xmm2, %xmm2
        vmovd   %xmm0, %eax
        addl    $-4, %eax
        vmovaps %ymm4, (%rsp)
        andl    $3, %eax
        vpextrd $2, %xmm0, %edx
        addl    $-4, %edx
        andl    $3, %edx
        vextractps      $2, %xmm2, %ecx
        vmovaps %ymm3, 32(%rsp)
        movl    %ecx, %r8d
        andl    $3, %r8d
        addl    $-4, %ecx
        andl    $3, %ecx
        vmovdqa .LCPI0_1(%rip), %xmm0           # xmm0 = [4,4,4,4]
        vpcmpgtd        %xmm2, %xmm0, %xmm3
        vpshufd $238, %xmm3, %xmm0              # xmm0 = xmm3[2,3,2,3]
        vpmovsxdq       %xmm0, %xmm0
        vpmovsxdq       %xmm3, %xmm3
        vpextrd $3, %xmm2, %edi
        movl    %edi, %esi
        andl    $3, %esi
        addl    $-4, %edi
        vpaddq  %xmm2, %xmm2, %xmm2
        vmovapd 16(%rbp), %xmm4
        vmovapd 32(%rbp), %xmm5
        vpermilpd       %xmm2, %xmm5, %xmm5
        vpermilpd       %xmm2, %xmm4, %xmm4
        vpcmpgtq        .LCPI0_2(%rip), %xmm2, %xmm2
        andl    $3, %edi
        vblendvpd       %xmm2, %xmm5, %xmm4, %xmm2
        vmovsd  (%rsp,%rdx,8), %xmm4            # xmm4 = mem[0],zero
        vmovsd  (%rsp,%rax,8), %xmm5            # xmm5 = mem[0],zero
        vunpcklpd       %xmm4, %xmm5, %xmm4     # xmm4 = xmm5[0],xmm4[0]
        vblendvpd       %xmm3, %xmm2, %xmm4, %xmm2
        vmovsd  32(%rsp,%rsi,8), %xmm3          # xmm3 = mem[0],zero
        vmovsd  32(%rsp,%r8,8), %xmm4           # xmm4 = mem[0],zero
        vunpcklpd       %xmm3, %xmm4, %xmm3     # xmm3 = xmm4[0],xmm3[0]
        vmovsd  (%rsp,%rdi,8), %xmm4            # xmm4 = mem[0],zero
        vmovsd  (%rsp,%rcx,8), %xmm5            # xmm5 = mem[0],zero
        vunpcklpd       %xmm4, %xmm5, %xmm4     # xmm4 = xmm5[0],xmm4[0]
        vblendvpd       %xmm0, %xmm3, %xmm4, %xmm0
        vpcmpgtq        %xmm2, %xmm1, %xmm2
        vextractf128    $1, %ymm1, %xmm1
        vpcmpgtq        %xmm0, %xmm1, %xmm0
        vinsertf128     $1, %xmm0, %ymm2, %ymm0
        movq    %rbp, %rsp
        popq    %rbp
        .cfi_def_cfa %rsp, 8
        retq

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210516/2618f82f/attachment.html>