[llvm-bugs] [Bug 30845] New: llvm does not recognize widening vector multiplication

Mon Oct 31 07:34:42 PDT 2016

https://llvm.org/bugs/show_bug.cgi?id=30845

            Bug ID: 30845
           Summary: llvm does not recognize widening vector multiplication
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedbugs at nondot.org
          Reporter: sroland at vmware.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Since llvm does not have widening muls (I'm looking at unsigned 32/32->64bit
specifically), this has to be done by using zext on the inputs (I didn't try
signed, but the same logic should apply using sext), followed by the mul.

Unfortunately, llvm will not recognize that this is really a 32/32->64bit mul,
which can be done natively with pmuludq, and will instead perform a "real"
64/64->64bit mul (3 pmuludq, 4 shifts, 3 adds as far as arithmetic goes instead
of a single pmuludq).

Requiring the high bits of the mul result is an operation needed quite a bit
(e.g. opencl mul_hi will return the high bits of such a 32/32->64bit mul, or if
you want to simply do overflow checking on a 32bit mul).

This code basically does such a mul_hi returning the high bits of a
32/32->64bit unsigned mul (obviously, this actually needs 2 pmuludq plus some
input/output shuffling (or shifts instead of shuffles)):

; llc-3.7 -mattr=sse2 -o - umul.ll

define <4 x i32> @umul(<4 x i32> %val1, <4 x i32> %val2) {
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = zext <4 x i32> %val2 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

And it compiles to this monstrosity (with llvm master, but pretty much the same
with older versions):
        pxor    %xmm4, %xmm4
        movdqa  %xmm0, %xmm2
        punpckhdq       %xmm4, %xmm2    # xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
        punpckldq       %xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
        movdqa  %xmm1, %xmm3
        punpckhdq       %xmm4, %xmm3    # xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
        punpckldq       %xmm4, %xmm1    # xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
        movdqa  %xmm0, %xmm4
        pmuludq %xmm1, %xmm4
        movdqa  %xmm1, %xmm5
        psrlq   $32, %xmm5
        pmuludq %xmm0, %xmm5
        psllq   $32, %xmm5
        psrlq   $32, %xmm0
        pmuludq %xmm1, %xmm0
        psllq   $32, %xmm0
        paddq   %xmm5, %xmm0
        paddq   %xmm4, %xmm0
        movdqa  %xmm2, %xmm1
        pmuludq %xmm3, %xmm1
        movdqa  %xmm3, %xmm4
        psrlq   $32, %xmm4
        pmuludq %xmm2, %xmm4
        psllq   $32, %xmm4
        psrlq   $32, %xmm2
        pmuludq %xmm3, %xmm2
        psllq   $32, %xmm2
        paddq   %xmm4, %xmm2
        paddq   %xmm1, %xmm2
        pshufd  $237, %xmm2, %xmm1      # xmm1 = xmm2[1,3,2,3]
        pshufd  $237, %xmm0, %xmm0      # xmm0 = xmm0[1,3,2,3]
        punpcklqdq      %xmm1, %xmm0    # xmm0 = xmm0[0],xmm1[0]
        retq

The failure to recognize the widening pattern also happens with avx2, or if
just multiplying the lower 2 of the 4 numbers, returning 64bit results (though
obviously there's only half as much code in this case, skipping the second
half).

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20161031/96804e7c/attachment.html>