[llvm-bugs] [Bug 29025] New: <12 x i8> shuffle is lowered to a sequence of extracts and inserts.

via llvm-bugs llvm-bugs at lists.llvm.org
Wed Aug 17 17:08:53 PDT 2016


https://llvm.org/bugs/show_bug.cgi?id=29025

            Bug ID: 29025
           Summary: <12 x i8> shuffle is lowered to a sequence of extracts
                    and inserts.
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: mkuper at google.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Consider three equivalent IR functions:

define void @bad(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) {
  %s1 = shufflevector <4 x i8> %a, <4 x i8> %b,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %c, <4 x i8> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                   i32 undef, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i8> %s1, <8 x i8> %s2,
       <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9,
                   i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p, align 1
  ret void
}

define void @better(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) {
  %s1 = shufflevector <4 x i8> %a, <4 x i8> %b,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %c, <4 x i8> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                   i32 undef, i32 undef, i32 undef, i32 undef>
  br label %foo
foo:
  %r = shufflevector <8 x i8> %s1, <8 x i8> %s2,
       <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9,
                   i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p, align 1
  ret void
}

define void @best(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) {
  %s1 = shufflevector <4 x i8> %a, <4 x i8> %b,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %c, <4 x i8> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                   i32 undef, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i8> %s1, <8 x i8> %s2,
       <16 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6,
                   i32 10, i32 3, i32 7, i32 11,
                   i32 undef, i32 undef, i32 undef, i32 undef>
  br label %foo
foo:
  %r2 = shufflevector <16 x i8> %r, <16 x i8> undef,
        <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5,
                    i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i8> %r2, <12 x i8>* %p, align 1
  ret void
}

When compiling for AVX, for 'bad', we get:
    vpextrb    $0, %xmm1, %eax
    vpinsrb    $1, %eax, %xmm0, %xmm3
    vpextrb    $0, %xmm2, %eax
    vpinsrb    $2, %eax, %xmm3, %xmm3
    vpextrb    $4, %xmm0, %eax
    vpinsrb    $3, %eax, %xmm3, %xmm3
    vpextrb    $4, %xmm1, %eax
    vpinsrb    $4, %eax, %xmm3, %xmm3
    vpextrb    $4, %xmm2, %eax
    vpinsrb    $5, %eax, %xmm3, %xmm3
    vpextrb    $8, %xmm0, %eax
    vpinsrb    $6, %eax, %xmm3, %xmm3
    vpextrb    $8, %xmm1, %eax
    vpinsrb    $7, %eax, %xmm3, %xmm3
    vpextrb    $8, %xmm2, %eax
    vpinsrb    $8, %eax, %xmm3, %xmm3
    vpextrb    $12, %xmm0, %eax
    vpinsrb    $9, %eax, %xmm3, %xmm0
    vpextrb    $12, %xmm1, %eax
    vpinsrb    $10, %eax, %xmm0, %xmm0
    vpextrb    $12, %xmm2, %eax
    vpinsrb    $11, %eax, %xmm0, %xmm0
    vpextrd    $2, %xmm0, 8(%rdi)
    vmovq    %xmm0, (%rdi)
    retq

For 'better':
    vmovdqa    .LCPI1_0(%rip), %xmm3
    vpshufb    %xmm3, %xmm1, %xmm1
    vpshufb    %xmm3, %xmm0, %xmm0
    vpunpcklqdq    %xmm1, %xmm0, %xmm0
    vpshufb    %xmm3, %xmm2, %xmm1
    vpshufb    .LCPI1_1(%rip), %xmm1, %xmm1
    vpshufb    .LCPI1_2(%rip), %xmm0, %xmm0
    vpor    %xmm1, %xmm0, %xmm0
    vpextrd    $2, %xmm0, 8(%rdi)
    vmovq    %xmm0, (%rdi)
    retq

And for 'best':
    vmovdqa    .LCPI2_0(%rip), %xmm3
    vpshufb    %xmm3, %xmm1, %xmm1
    vpshufb    %xmm3, %xmm0, %xmm0
    vpunpckldq    %xmm1, %xmm0, %xmm0
    vpshufb    %xmm3, %xmm2, %xmm1
    vpunpcklbw    %xmm1, %xmm0, %xmm0
    vpshufb    .LCPI2_1(%rip), %xmm0, %xmm0
    vpextrd    $2, %xmm0, 8(%rdi)
    vmovq    %xmm0, (%rdi)

The first problem is that when constructing a <12 x i8> out of two
<8 x i8> vectors, isel produces a series of extracts feeding a build_vector
instead of a vector_shuffle, because the mask length (12) is not a multiple
of the input length (8). This happens for both 'bad' and 'better'. 'best'
avoids this problem because it never sees a v12i8 being constructed
directly, only one extracted from a v16i8.
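
For reference, the fallback should not require the preceding concat
shuffles to trigger; a minimal sketch (not verified here, but it has the
same shape: a 12-element mask over 8-element inputs) that should take the
same extract/insert path is:

define void @minimal(<8 x i8> %s1, <8 x i8> %s2, <12 x i8> *%p) {
  %r = shufflevector <8 x i8> %s1, <8 x i8> %s2,
       <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9,
                   i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i8> %r, <12 x i8>* %p, align 1
  ret void
}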

The second problem is that for 'bad' we don't manage to clean this up. In
'better' we are saved by a dagcombine, but for 'bad' that combine doesn't
fire. This comes down to how the sources of the build_vector end up being
type-legalized: in 'better' we extract from two v8i16 vectors, while in
'bad' we extract from three v4i32 vectors and an undef.
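
The shape that @best reaches by hand is also a possible workaround for the
minimal case above: widen the result to the next multiple of the source
width with undef mask lanes, and take the low 12 lanes afterwards. A
condensed sketch (hedged: whether this form keeps lowering well without the
basic-block split used in @best and @better has not been verified here):

define void @minimal_widened(<8 x i8> %s1, <8 x i8> %s2, <12 x i8> *%p) {
  ; Same 12 lanes as @minimal, but built as a <16 x i8>, so the mask length
  ; (16) is a multiple of the input length (8).
  %wide = shufflevector <8 x i8> %s1, <8 x i8> %s2,
          <16 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6,
                      i32 10, i32 3, i32 7, i32 11,
                      i32 undef, i32 undef, i32 undef, i32 undef>
  ; Keep only the low 12 lanes for the store.
  %r = shufflevector <16 x i8> %wide, <16 x i8> undef,
       <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5,
                   i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i8> %r, <12 x i8>* %p, align 1
  ret void
}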
