[llvm-bugs] [Bug 31677] New: clang/llvm fails to vectorize the product of a complex array

Wed Jan 18 07:10:31 PST 2017

https://llvm.org/bugs/show_bug.cgi?id=31677

            Bug ID: 31677
           Summary: clang/llvm fails to vectorize the product of a complex
                    array
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: drraph at gmail.com
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Consider this simple piece of code which takes the product of an array
of complex numbers.

#include <complex.h>
/*
 * Reproducer: starts from p = 1.0 and multiplies in the 32 complex floats
 * x[0]..x[31], returning the resulting product. The trip count is the
 * compile-time constant 32.
 */
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 32; i++)
    p *= x[i];  /* complex multiply reduction: the loop under discussion */
  return p;
}

If I compile it with -O3 -march=bdver2 -ffast-math using clang 3.9.1, I get
unvectorised assembly:

# clang 3.9.1 output for f() (-O3 -march=bdver2 -ffast-math), Intel syntax.
# Scalar code: one complex multiply per loop iteration, 32 iterations.
# Register roles, as read from the instructions below:
#   rdi  = base address the elements are loaded from (presumably x), 8 bytes
#          per element
#   rax  = element index i, counts 0..31
#   xmm0 = real part of the running product p (initialised to 1.0)
#   xmm1 = imaginary part of p (initialised to 0.0)
.LCPI0_0:
        .long   1065353216              # float 1
f:                                      # @f
        vxorps  xmm1, xmm1, xmm1
        vmovss  xmm0, dword ptr [rip + .LCPI0_0] # xmm0 = mem[0],zero,zero,zero
        xor     eax, eax
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        # Load x[i]: first float of the pair into xmm2, second into xmm3.
        vmovss  xmm2, dword ptr [rdi + 8*rax] # xmm2 = mem[0],zero,zero,zero
        vmovss  xmm3, dword ptr [rdi + 8*rax + 4] # xmm3 = mem[0],zero,zero,zero
        # p *= x[i]:
        #   new im = x.im*p.re + x.re*p.im
        #   new re = x.re*p.re - x.im*p.im
        # Both cross terms are formed from the old p.im before xmm1 is
        # overwritten; xmm0 (old p.re) is overwritten last.
        vmulss  xmm4, xmm2, xmm1        # xmm4 = x.re * p.im
        vmulss  xmm5, xmm3, xmm1        # xmm5 = x.im * p.im
        vfmaddss        xmm1, xmm3, xmm0, xmm4  # p.im = x.im*p.re + xmm4
        vfmsubss        xmm0, xmm2, xmm0, xmm5  # p.re = x.re*p.re - xmm5
        inc     rax
        cmp     rax, 32
        jne     .LBB0_1
        # Pack the result as {re, im} in the low two lanes of xmm0.
        vinsertps       xmm0, xmm0, xmm1, 16 # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
        ret

As a test I also tried icc (the Intel compiler), which does appear to
give vectorised code:

# icc output for the same f(), Intel syntax. The trailing "#L.C" tags appear
# to be icc's source line.column annotations.
# Vectorised with SSE: each xmm register holds two complex floats laid out as
# {re0, im0, re1, im1}, so two independent partial products are carried in
# parallel. Each pass of the loop consumes eight 16-byte loads (16 complex
# floats) and advances rax by 16; with the exit test rax < 32 the loop body
# executes twice. The two partial products are multiplied together after
# the loop.
f:
        xor       eax, eax                                      #4.3
        # xmm1 = running product, initialised to {1.0, 0.0, 1.0, 0.0}.
        movups    xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip]   #3.19
..B1.2:                         # Preds ..B1.2 ..B1.1
        movups    xmm0, XMMWORD PTR [rdi+rax*8]                 #5.10
        movups    xmm4, XMMWORD PTR [32+rdi+rax*8]              #5.10
        movups    xmm6, XMMWORD PTR [48+rdi+rax*8]              #5.10
        movups    xmm8, XMMWORD PTR [64+rdi+rax*8]              #5.10
        movups    xmm10, XMMWORD PTR [80+rdi+rax*8]             #5.10
        movups    xmm12, XMMWORD PTR [96+rdi+rax*8]             #5.10
        movups    xmm14, XMMWORD PTR [112+rdi+rax*8]            #5.10
        # Packed complex multiply of xmm0 (just loaded) by xmm1 (running
        # product), result in xmm2:
        #   shufps ..., 160 -> {re0, re0, re1, re1} of one operand
        #   shufps ..., 245 -> {im0, im0, im1, im1} of the same operand
        #   xorps with .L_2il0floatpacket.1 flips the sign of lanes 1 and 3
        #   shufps ..., 177 swaps the two lanes within each pair
        # giving {a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re} per pair.
        movaps    xmm2, xmm0                                    #5.5
        shufps    xmm2, xmm0, 160                               #5.5
        mulps     xmm2, xmm1                                    #5.5
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm1, xmm1, 177                               #5.5
        shufps    xmm0, xmm0, 245                               #5.5
        mulps     xmm1, xmm0                                    #5.5
        addps     xmm2, xmm1                                    #5.5
        # xmm1 is free again, so the second 16-byte chunk is loaded into it
        # here (still using the pre-increment rax).
        movups    xmm1, XMMWORD PTR [16+rdi+rax*8]              #5.10
        # The same eight-instruction multiply pattern repeats for each
        # remaining chunk; the result of one stage feeds the next
        # (xmm2 -> xmm3 -> xmm5 -> xmm7 -> xmm9 -> xmm11 -> xmm13 -> xmm1).
        movaps    xmm3, xmm2                                    #5.5
        add       rax, 16                                       #4.3
        shufps    xmm3, xmm2, 160                               #5.5
        mulps     xmm3, xmm1                                    #5.5
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm1, xmm1, 177                               #5.5
        shufps    xmm2, xmm2, 245                               #5.5
        mulps     xmm1, xmm2                                    #5.5
        addps     xmm3, xmm1                                    #5.5
        movaps    xmm5, xmm3                                    #5.5
        shufps    xmm5, xmm3, 160                               #5.5
        mulps     xmm5, xmm4                                    #5.5
        xorps     xmm4, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm4, xmm4, 177                               #5.5
        shufps    xmm3, xmm3, 245                               #5.5
        mulps     xmm4, xmm3                                    #5.5
        addps     xmm5, xmm4                                    #5.5
        movaps    xmm7, xmm5                                    #5.5
        shufps    xmm7, xmm5, 160                               #5.5
        mulps     xmm7, xmm6                                    #5.5
        xorps     xmm6, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm6, xmm6, 177                               #5.5
        shufps    xmm5, xmm5, 245                               #5.5
        mulps     xmm6, xmm5                                    #5.5
        addps     xmm7, xmm6                                    #5.5
        movaps    xmm9, xmm7                                    #5.5
        shufps    xmm9, xmm7, 160                               #5.5
        mulps     xmm9, xmm8                                    #5.5
        xorps     xmm8, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm8, xmm8, 177                               #5.5
        shufps    xmm7, xmm7, 245                               #5.5
        mulps     xmm8, xmm7                                    #5.5
        addps     xmm9, xmm8                                    #5.5
        movaps    xmm11, xmm9                                   #5.5
        shufps    xmm11, xmm9, 160                              #5.5
        mulps     xmm11, xmm10                                  #5.5
        xorps     xmm10, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps    xmm10, xmm10, 177                             #5.5
        shufps    xmm9, xmm9, 245                               #5.5
        mulps     xmm10, xmm9                                   #5.5
        addps     xmm11, xmm10                                  #5.5
        movaps    xmm13, xmm11                                  #5.5
        shufps    xmm13, xmm11, 160                             #5.5
        mulps     xmm13, xmm12                                  #5.5
        xorps     xmm12, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps    xmm12, xmm12, 177                             #5.5
        shufps    xmm11, xmm11, 245                             #5.5
        mulps     xmm12, xmm11                                  #5.5
        addps     xmm13, xmm12                                  #5.5
        # Last stage of the pass writes the running product back into xmm1.
        movaps    xmm1, xmm13                                   #5.5
        shufps    xmm1, xmm13, 160                              #5.5
        mulps     xmm1, xmm14                                   #5.5
        xorps     xmm14, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps    xmm14, xmm14, 177                             #5.5
        shufps    xmm13, xmm13, 245                             #5.5
        mulps     xmm14, xmm13                                  #5.5
        addps     xmm1, xmm14                                   #5.5
        cmp       rax, 32                                       #4.3
        jb        ..B1.2        # Prob 96%                      #4.3
        # Combine the two partial products: movhlps copies the upper pair of
        # xmm1 into the lower pair of xmm2, then one more packed complex
        # multiply leaves the final product in the low two lanes of xmm0.
        movaps    xmm2, xmm1                                    #3.19
        movhlps   xmm2, xmm1                                    #3.19
        movaps    xmm0, xmm2                                    #3.19
        shufps    xmm0, xmm2, 160                               #3.19
        mulps     xmm0, xmm1                                    #3.19
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #3.19
        shufps    xmm1, xmm1, 177                               #3.19
        shufps    xmm2, xmm2, 245                               #3.19
        mulps     xmm1, xmm2                                    #3.19
        addps     xmm0, xmm1                                    #3.19
        ret                                                     #6.10
# Initial accumulator: {1.0f, 0.0f, 1.0f, 0.0f}, i.e. two copies of 1 + 0i.
.L_2il0floatpacket.0:
        .long   0x3f800000,0x00000000,0x3f800000,0x00000000
# Sign-bit mask for lanes 1 and 3 (the second float of each pair).
.L_2il0floatpacket.1:
        .long   0x00000000,0x80000000,0x00000000,0x80000000

Interestingly, clang *can* vectorise the same loop when the reduction is a
sum rather than a product:

#include <complex.h>
/*
 * Addition variant of the reproducer: starts from p = 1.0 and adds the 32
 * complex floats x[0]..x[31], so the result is 1.0 plus their sum.
 */
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 32; i++)
    p += x[i]; /* <--- + instead of * */ 
  return p;
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170118/812139d5/attachment-0001.html>