[llvm-bugs] [Bug 50323] New: Wrong code generated with -fslp-vectorize

Wed May 12 21:14:52 PDT 2021

https://bugs.llvm.org/show_bug.cgi?id=50323

            Bug ID: 50323
           Summary: Wrong code generated with -fslp-vectorize
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: kazu at google.com
                CC: craig.topper at gmail.com, llvm-bugs at lists.llvm.org,
                    llvm-dev at redking.me.uk, pengfei.wang at intel.com,
                    spatel+llvm at rotateright.com

I'm seeing a miscompilation triggered by -fslp-vectorize.

Consider:

#include <stdint.h>
#include <immintrin.h>
#include <stdio.h>

struct Pair {
  __m256i lo;
  __m256i hi;
};

static inline int64_t Extract(const Pair& v, int index) {
  return index < 4 ? v.lo[index] : v.hi[index - 4];
}

// This function gets miscompiled.  This is a lot like
_mm512_permutexvar_epi64.
// It takes a pair of __m256i along with an array of indexes and returns a
// permutation in __m256i.
__attribute__((noinline)) __m256i Permute(Pair a, __m256i map) {
  int64_t result[] = {0, 0, 0, 0};
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(result), a.lo);
  result[0] = Extract(a, map[0] & 0x7);
  result[1] = Extract(a, map[1] & 0x7);
  result[2] = Extract(a, map[2] & 0x7);
  result[3] = Extract(a, map[3] & 0x7);
  return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(result));
}

int main() {
  Pair v;
  v.lo = _mm256_set_epi64x(0xa3, 0xa2, 0xa1, 0xa0);
  v.hi = _mm256_set_epi64x(0xa7, 0xa6, 0xa5, 0xa4);
  __m256i r = Permute(v, _mm256_set_epi64x(2, 3, 4, 5));
  printf("%02x %02x %02x %02x\n",
         (int)_mm256_extract_epi64(r, 3),
         (int)_mm256_extract_epi64(r, 2),
         (int)_mm256_extract_epi64(r, 1),
         (int)_mm256_extract_epi64(r, 0));
  return 0;
}

With the latest clang (a0ca4c46ca35957a38a6023fa84afda2fc9ba0ec), I see:

$ ./release/bin/clang++ -O3 -mavx -fno-slp-vectorize permute.cc ; ./a.out
a2 a3 a4 a5
$ ./release/bin/clang++ -O3 -mavx -fslp-vectorize    permute.cc ; ./a.out
a0 a3 a4 a5

Notice that the top lane is different -- 0xa0 v.s. 0xa2.

Here is the assembly output for Permute:

        .text
        .file   "permute.cc"
        .globl  _Z7Permute4PairDv4_x            # -- Begin function
_Z7Permute4PairDv4_x
        .p2align        4, 0x90
        .type   _Z7Permute4PairDv4_x, at function
_Z7Permute4PairDv4_x:                   # @_Z7Permute4PairDv4_x
        .cfi_startproc
# %bb.0:
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        andq    $-32, %rsp
        subq    $96, %rsp
        vextractf128    $1, %ymm0, %xmm1
        vpextrd $2, %xmm1, %eax
        movl    %eax, %edx
        andl    $7, %edx
        vmovaps 16(%rbp), %ymm2
        vmovaps 48(%rbp), %ymm3
        vmovaps %ymm2, (%rsp)
        vmovaps %ymm3, 32(%rsp)
        vmovd   %xmm1, %ecx
        subl    $4, %edx
        jae     .LBB0_1
# %bb.2:
        andl    $3, %eax
        movq    (%rsp,%rax,8), %rax
        jmp     .LBB0_3
.LBB0_1:
        andl    $3, %edx
        movq    32(%rsp,%rdx,8), %rax
.LBB0_3:
        movl    %ecx, %esi
        andl    $7, %esi
        vpextrd $2, %xmm0, %edx
        subl    $4, %esi
        jae     .LBB0_4
# %bb.5:
        andl    $3, %ecx
        movq    (%rsp,%rcx,8), %rcx
        jmp     .LBB0_6
.LBB0_4:
        andl    $3, %esi
        movq    32(%rsp,%rsi,8), %rcx
.LBB0_6:
        movl    %edx, %edi
        andl    $7, %edi
        vmovd   %xmm0, %esi
        subl    $4, %edi
        jae     .LBB0_7
# %bb.8:
        andl    $3, %edx
        movq    (%rsp,%rdx,8), %rdx
        movl    %esi, %edi
        andl    $7, %edi
        subl    $4, %edi
        jb      .LBB0_11
.LBB0_10:
        andl    $3, %edi
        movq    32(%rsp,%rdi,8), %rsi
        jmp     .LBB0_12
.LBB0_7:
        andl    $3, %edi
        movq    32(%rsp,%rdi,8), %rdx
        movl    %esi, %edi
        andl    $7, %edi
        subl    $4, %edi
        jae     .LBB0_10
.LBB0_11:
        andl    $3, %esi
        movq    (%rsp,%rsi,8), %rsi
.LBB0_12:
        vmovq   %rax, %xmm0
        vmovq   %rcx, %xmm1
        vpunpcklqdq     %xmm0, %xmm1, %xmm0     # xmm0 = xmm1[0],xmm0[0]
        vmovq   %rdx, %xmm1
        vmovq   %rsi, %xmm2
        vpunpcklqdq     %xmm1, %xmm2, %xmm1     # xmm1 = xmm2[0],xmm1[0]
        vinsertf128     $1, %xmm0, %ymm1, %ymm0
        movq    %rbp, %rsp
        popq    %rbp
        .cfi_def_cfa %rsp, 8
        retq

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20210513/0010089e/attachment.html>