[llvm-bugs] [Bug 29025] New: <12 x i8> shuffle is lowered to a sequence of extracts and inserts.
via llvm-bugs
llvm-bugs at lists.llvm.org
Wed Aug 17 17:08:53 PDT 2016
https://llvm.org/bugs/show_bug.cgi?id=29025
Bug ID: 29025
Summary: <12 x i8> shuffle is lowered to a sequence of extracts
and inserts.
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: mkuper at google.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
Consider three equivalent IR functions:
define void @bad(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) {
%s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2,
i32 3, i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32
2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32
8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
store <12 x i8> %r, <12 x i8>* %p, align 1
ret void
}
define void @better(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) {
%s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2,
i32 3, i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32
2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
br label %foo
foo:
%r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32
8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
store <12 x i8> %r, <12 x i8>* %p, align 1
ret void
}
define void @best(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) {
%s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2,
i32 3, i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32
2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 4, i32
8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11, i32 undef,
i32 undef, i32 undef, i32 undef>
br label %foo
foo:
%r2 = shufflevector <16 x i8> %r, <16 x i8> undef, <12 x i32> <i32 0, i32 1,
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
store <12 x i8> %r2, <12 x i8>* %p, align 1
ret void
}
When compiling for AVX, for 'bad', we get:
vpextrb $0, %xmm1, %eax
vpinsrb $1, %eax, %xmm0, %xmm3
vpextrb $0, %xmm2, %eax
vpinsrb $2, %eax, %xmm3, %xmm3
vpextrb $4, %xmm0, %eax
vpinsrb $3, %eax, %xmm3, %xmm3
vpextrb $4, %xmm1, %eax
vpinsrb $4, %eax, %xmm3, %xmm3
vpextrb $4, %xmm2, %eax
vpinsrb $5, %eax, %xmm3, %xmm3
vpextrb $8, %xmm0, %eax
vpinsrb $6, %eax, %xmm3, %xmm3
vpextrb $8, %xmm1, %eax
vpinsrb $7, %eax, %xmm3, %xmm3
vpextrb $8, %xmm2, %eax
vpinsrb $8, %eax, %xmm3, %xmm3
vpextrb $12, %xmm0, %eax
vpinsrb $9, %eax, %xmm3, %xmm0
vpextrb $12, %xmm1, %eax
vpinsrb $10, %eax, %xmm0, %xmm0
vpextrb $12, %xmm2, %eax
vpinsrb $11, %eax, %xmm0, %xmm0
vpextrd $2, %xmm0, 8(%rdi)
vmovq %xmm0, (%rdi)
retq
For 'better':
vmovdqa .LCPI1_0(%rip), %xmm3
vpshufb %xmm3, %xmm1, %xmm1
vpshufb %xmm3, %xmm0, %xmm0
vpunpcklqdq %xmm1, %xmm0, %xmm0
vpshufb %xmm3, %xmm2, %xmm1
vpshufb .LCPI1_1(%rip), %xmm1, %xmm1
vpshufb .LCPI1_2(%rip), %xmm0, %xmm0
vpor %xmm1, %xmm0, %xmm0
vpextrd $2, %xmm0, 8(%rdi)
vmovq %xmm0, (%rdi)
retq
And for 'best':
vmovdqa .LCPI2_0(%rip), %xmm3
vpshufb %xmm3, %xmm1, %xmm1
vpshufb %xmm3, %xmm0, %xmm0
vpunpckldq %xmm1, %xmm0, %xmm0
vpshufb %xmm3, %xmm2, %xmm1
vpunpcklbw %xmm1, %xmm0, %xmm0
vpshufb .LCPI2_1(%rip), %xmm0, %xmm0
vpextrd $2, %xmm0, 8(%rdi)
vmovq %xmm0, (%rdi)
The first problem is that when constructing a <12 x i8> out of two <8 x i8>
vectors, isel produces a series of element extracts + a build_vector instead of
a vector_shuffle, because the mask length (12) is not a multiple of the input
length (8). This happens for both 'bad' and 'better'. 'best' avoids this
problem, because it never sees a v12i8 being constructed directly, only
extracted from a v16i8.
The second problem is that for 'bad' we don't manage to clean up the resulting
extract/insert sequence. In 'better', we are saved by a dagcombine, but for
'bad' that combine does not kick in. This has to do with the way we end up
legalizing the sources of the build_vector: in 'better' we extract from two
v8i16 vectors, and in 'bad' from three v4i32 vectors and an undef.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160818/b5ed44bc/attachment-0001.html>
More information about the llvm-bugs mailing list