[llvm-bugs] [Bug 31111] New: vectors with length 3 and width 8 or 16 cannot be efficiently loaded
via llvm-bugs
llvm-bugs at lists.llvm.org
Mon Nov 21 20:35:08 PST 2016
https://llvm.org/bugs/show_bug.cgi?id=31111
Bug ID: 31111
Summary: vectors with length 3 and width 8 or 16 cannot be efficiently loaded
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: sroland at vmware.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
This is probably not all that important - I just noticed it by accident - but
fetching <3 x i8> and <3 x i16> values (to fit into <16 x i8> and <8 x i16>
vectors) requires manual decomposition to generate efficient loads.
(Obviously, just fetching an unaligned vector with 4 elements would not have
these problems, but that may not always be possible.)
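For comparison, the 4-element fetch would look roughly like this - a minimal
sketch (the function name is made up), and note that it reads one byte past
the three valid ones:

define <4 x i32> @load4x8(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  ; unaligned 4-element load; touches one byte past the 3 valid ones
  %ad0 = bitcast i8* %0 to <4 x i8>*
  %va0 = load <4 x i8>, <4 x i8>* %ad0, align 1
  %va0s = shufflevector <4 x i8> %va0, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = bitcast <16 x i8> %va0s to <4 x i32>
  ret <4 x i32> %res
}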
The standard way we do this (note that usually we'd fill up the rest of the
vector with more elements) is:
define <4 x i32> @load3x8(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to i24*
  %va0 = load i24, i24* %ad0, align 1
  %va0z = zext i24 %va0 to i32
  %shuf = insertelement <4 x i32> undef, i32 %va0z, i32 0
  ret <4 x i32> %shuf
}
The code generated for that is ok, but due to the scalar zext it has some
extra instructions:
movslq %esi, %rcx
movzwl (%rdi,%rcx), %eax
movzbl 2(%rdi,%rcx), %ecx
shll $16, %ecx
orl %eax, %ecx
vmovd %ecx, %xmm0
ret
So, trying to get rid of that, an attempt would look like this:
define <4 x i32> @load3x8vec(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to <3 x i8>*
  %va0 = load <3 x i8>, <3 x i8>* %ad0, align 1
  %va0s = shufflevector <3 x i8> %va0, <3 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = bitcast <16 x i8> %va0s to <4 x i32>
  ret <4 x i32> %res
}
This could be done with just one pinsrw plus one pinsrb (with sse41 at
least); however llvm produces this wtf piece of assembly (this is with avx -
the sse2 case looks different but is just as terrible):
movslq %esi, %rax
movzwl (%rdi,%rax), %ecx
vmovd %rcx, %xmm0
vpshufb .LCPI1_0(%rip), %xmm0, %xmm0
vmovd %xmm0, %ecx
movzbl 2(%rdi,%rax), %eax
vmovd %ecx, %xmm0
vpextrb $0, %xmm0, %ecx
vpextrb $1, %xmm0, %edx
shll $8, %edx
orl %ecx, %edx
vpinsrw $0, %edx, %xmm0, %xmm0
vpinsrw $1, %eax, %xmm0, %xmm0
ret
There is some very serious anti-optimization going on here...
The only way to force llvm to spit out efficient code seems to be to decompose
this into 2 loads manually, which is quite annoying:
define <4 x i32> @load3x8manual(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ptr1 = getelementptr i8, i8* %ptr, i32 2
  %ad0b = getelementptr i8, i8* %ptr1, i32 %offset
  %ad0 = bitcast i8* %0 to i16*
  %va0w = load i16, i16* %ad0, align 1
  %va0b = load i8, i8* %ad0b, align 1
  %shuf0 = insertelement <8 x i16> undef, i16 %va0w, i32 0
  %shufc = bitcast <8 x i16> %shuf0 to <16 x i8>
  %shuf1 = insertelement <16 x i8> %shufc, i8 %va0b, i32 2
  %res = bitcast <16 x i8> %shuf1 to <4 x i32>
  ret <4 x i32> %res
}
which produces:
movslq %esi, %rax
movzwl (%rdi,%rax), %ecx
vmovd %ecx, %xmm0
vpinsrb $2, 2(%rdi,%rax), %xmm0, %xmm0
ret
(Note this is still not ideal, as it should just be a vpinsrw instead of the
movzwl+vmovd - however, when actually fetching 3 more such values to fill up
the vector, llvm will indeed use vpinsrw. And with sse2 only this doesn't
turn out that well either, but it's probably impossible to beat the scalar
i24->i32 zext there.)
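To illustrate the fill-up case, here is a sketch with a second value added
the same way (the function name, the extra %offset2 argument, and the
destination lanes - which assume each 3-byte value ends up in its own 4-byte
slot of the result - are made up for this example; the full case would add
two more values in the same pattern):

define <4 x i32> @load3x8manual2(i8* %ptr, i32 %offset, i32 %offset2) {
entry:
  ; first value, decomposed into a 2-byte and a 1-byte load as above
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ptr1 = getelementptr i8, i8* %ptr, i32 2
  %ad0b = getelementptr i8, i8* %ptr1, i32 %offset
  %ad0 = bitcast i8* %0 to i16*
  %va0w = load i16, i16* %ad0, align 1
  %va0b = load i8, i8* %ad0b, align 1
  ; second value, same decomposition at %offset2
  %1 = getelementptr i8, i8* %ptr, i32 %offset2
  %ad1b = getelementptr i8, i8* %ptr1, i32 %offset2
  %ad1 = bitcast i8* %1 to i16*
  %va1w = load i16, i16* %ad1, align 1
  %va1b = load i8, i8* %ad1b, align 1
  ; word parts go into i16 lanes 0 and 2, byte parts into bytes 2 and 6
  %shuf0 = insertelement <8 x i16> undef, i16 %va0w, i32 0
  %shuf1 = insertelement <8 x i16> %shuf0, i16 %va1w, i32 2
  %shufc = bitcast <8 x i16> %shuf1 to <16 x i8>
  %shuf2 = insertelement <16 x i8> %shufc, i8 %va0b, i32 2
  %shuf3 = insertelement <16 x i8> %shuf2, i8 %va1b, i32 6
  %res = bitcast <16 x i8> %shuf3 to <4 x i32>
  ret <4 x i32> %res
}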
The <3 x i16> case fails in pretty much the same way, still resorting to some
seriously weird scalar extraction.
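For reference, the analogous <3 x i16> attempt would look roughly like this -
a minimal sketch (the function name is made up), mirroring load3x8vec above:

define <4 x i32> @load3x16vec(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to <3 x i16>*
  %va0 = load <3 x i16>, <3 x i16>* %ad0, align 2
  %va0s = shufflevector <3 x i16> %va0, <3 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = bitcast <8 x i16> %va0s to <4 x i32>
  ret <4 x i32> %res
}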
However, fetching a <3 x i32> vector that way works fine without hand-holding
(fetching it as a scalar i96 still produces some extra instructions, but the
vector load comes out clean):
define <4 x i32> @load3x32vec(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to <3 x i32>*
  %va0 = load <3 x i32>, <3 x i32>* %ad0, align 4
  %shuf = shufflevector <3 x i32> %va0, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  ret <4 x i32> %shuf
}
movslq %esi, %rax
vmovq (%rdi,%rax), %xmm0
vpinsrd $2, 8(%rdi,%rax), %xmm0, %xmm0
ret
(Though with only sse2 available, llvm will do:
movslq %esi, %rax
movq (%rdi,%rax), %xmm0 # xmm0 = mem[0],zero
movd 8(%rdi,%rax), %xmm1 # xmm1 = mem[0],zero,zero,zero
shufps $48, %xmm0, %xmm1 # xmm1 = xmm1[0,0],xmm0[3,0]
shufps $132, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,2]
retq
Which is definitely one shufps too many - even a simple unpack would have
done.)
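For reference, the scalar i96 variant mentioned above - the one that still
produces some extra instructions - would be something like this (a sketch;
the function name is made up):

define <4 x i32> @load3x32scalar(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to i96*
  %va0 = load i96, i96* %ad0, align 4
  ; widen to the full register and reinterpret as the vector type
  %va0z = zext i96 %va0 to i128
  %res = bitcast i128 %va0z to <4 x i32>
  ret <4 x i32> %res
}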
(I didn't try other non-power-of-two vector lengths; I sort of doubt things
would look better - but we definitely have no use for them anyway.)