[llvm-bugs] [Bug 31111] New: vectors with length 3 and width 8 or 16 cannot be efficiently loaded

via llvm-bugs llvm-bugs at lists.llvm.org
Mon Nov 21 20:35:08 PST 2016


https://llvm.org/bugs/show_bug.cgi?id=31111

            Bug ID: 31111
           Summary: vectors with length 3 and width 8 or 16 cannot be
                    efficiently loaded
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: sroland at vmware.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

This is probably not really important, more noticed this by accident, but
fetching <3 x i8> and <3 x i16> values (to fit into <16 x i8> and <8 x i16>
vectors) requires manual decomposition to generate efficient loads.
(Obviously, just (unaligned) fetching a vector with 4 elements would not have
these problems, but that may not always be possible.)

The standard way we do this (note that usually we'd fill up the rest of the
vector with more elements) is:
define <4 x i32> @load3x8(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to i24*
  %va0 = load i24, i24* %ad0, align 1
  %va0z = zext i24 %va0 to i32
  %shuf = insertelement <4 x i32> undef, i32 %va0z, i32 0
  ret <4 x i32> %shuf
}

The code generated for that is ok, but due to the scalar zext has some extra
instructions:
        movslq  %esi, %rcx
        movzwl  (%rdi,%rcx), %eax
        movzbl  2(%rdi,%rcx), %ecx
        shll    $16, %ecx
        orl     %eax, %ecx
        vmovd   %ecx, %xmm0
        ret

So, trying to get rid of that, an attempt would look like this:
define <4 x i32> @load3x8vec(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to <3 x i8>*
  %va0 = load <3 x i8>, <3 x i8>* %ad0, align 1
  %va0s = shufflevector <3 x i8> %va0, <3 x i8> undef, <16 x i32> <i32 0, i32
1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = bitcast <16 x i8> %va0s to <4 x i32>
  ret <4 x i32> %res
}

This could be done with just one pinsrw, one pinsrb (with sse41 at least),
however llvm produces this wtf piece of assembly (with avx, the sse2 case looks
different but just as terrible):
        movslq  %esi, %rax
        movzwl  (%rdi,%rax), %ecx
        vmovd   %rcx, %xmm0
        vpshufb .LCPI1_0(%rip), %xmm0, %xmm0
        vmovd   %xmm0, %ecx
        movzbl  2(%rdi,%rax), %eax
        vmovd   %ecx, %xmm0
        vpextrb $0, %xmm0, %ecx
        vpextrb $1, %xmm0, %edx
        shll    $8, %edx
        orl     %ecx, %edx
        vpinsrw $0, %edx, %xmm0, %xmm0
        vpinsrw $1, %eax, %xmm0, %xmm0
        ret
There is some very serious anti-optimization going on here...

The only way to force llvm to spit out efficient code seems to be to decompose
this into 2 loads manually, which is quite annoying:
define <4 x i32> @load3x8manual(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ptr1 = getelementptr i8, i8* %ptr, i32 2
  %ad0b = getelementptr i8, i8* %ptr1, i32 %offset
  %ad0 = bitcast i8* %0 to i16*
  %va0w = load i16, i16* %ad0, align 1
  %va0b = load i8, i8* %ad0b, align 1
  %shuf0 = insertelement <8 x i16> undef, i16 %va0w, i32 0
  %shufc = bitcast <8 x i16> %shuf0 to <16 x i8>
  %shuf1 = insertelement <16 x i8> %shufc, i8 %va0b, i32 2
  %res = bitcast <16 x i8> %shuf1 to <4 x i32>
  ret <4 x i32> %res
}

which produces:
        movslq  %esi, %rax
        movzwl  (%rdi,%rax), %ecx
        vmovd   %ecx, %xmm0
        vpinsrb $2, 2(%rdi,%rax), %xmm0, %xmm0
        ret
(Note this still fails, as it should just be vpinsrw instead of the
movzwl+vmovd - however when actually fetching 3 more such values to fill up the
vector then llvm will indeed use the vpinsrw instead. And with sse2 only this
doesn't turn out that well but it's probably impossible to beat the scalar
i24->i32 ZExt there.)

The <3 x i16> case pretty much fails all the same, still resorting to some
seriously weird scalar extraction... However, fetching a <3 x i32> vector that
way works fine without hand-holding (fetching as i96 still produces some extra
instructions, but fetching as a vector works fine):
define <4 x i32> @load3x32vec(i8* %ptr, i32 %offset) {
entry:
  %0 = getelementptr i8, i8* %ptr, i32 %offset
  %ad0 = bitcast i8* %0 to <3 x i32>*
  %va0 = load <3 x i32>, <3 x i32>* %ad0, align 4
  %shuf = shufflevector <3 x i32> %va0, <3 x i32> undef, <4 x i32> <i32 0, i32
1, i32 2, i32 undef>
  ret <4 x i32> %shuf
}

        movslq  %esi, %rax
        vmovq   (%rdi,%rax), %xmm0
        vpinsrd $2, 8(%rdi,%rax), %xmm0, %xmm0
        ret

(Though with sse2 only llvm will do:
        movslq  %esi, %rax
        movq    (%rdi,%rax), %xmm0      # xmm0 = mem[0],zero
        movd    8(%rdi,%rax), %xmm1     # xmm1 = mem[0],zero,zero,zero
        shufps  $48, %xmm0, %xmm1       # xmm1 = xmm1[0,0],xmm0[3,0]
        shufps  $132, %xmm1, %xmm0      # xmm0 = xmm0[0,1],xmm1[0,2]
        retq
Which is definitely one shufps too many - even a simple unpack would have done)


(I didn't try other non-power-of-two vector lengths, I sort of doubt things
would look better - but we definitely have no use for them)

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20161122/825e2c28/attachment.html>


More information about the llvm-bugs mailing list