[llvm-bugs] [Bug 32164] New: Inner product of two vectors gives inefficient code

Tue Mar 7 02:23:09 PST 2017

https://bugs.llvm.org/show_bug.cgi?id=32164

            Bug ID: 32164
           Summary: Inner product of two vectors gives inefficient code
           Product: new-bugs
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: drraph at gmail.com
                CC: llvm-bugs at lists.llvm.org

Consider:

typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double  __attribute__( ( vector_size( 32 ) ) ) float64x4_t;

float dotf(float32x4_t x, float32x4_t y) {
  float ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

double dotd(float64x4_t x, float64x4_t y) {
  double ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

clang trunk with -Ofast -march=corei7 gives:

dotf:                                   # @dotf
        mulps   xmm0, xmm1
        shufps  xmm0, xmm0, 27          # xmm0 = xmm0[3,2,1,0]
        movaps  xmm1, xmm0
        movhlps xmm1, xmm1              # xmm1 = xmm1[1,1]
        addps   xmm1, xmm0
        haddps  xmm1, xmm1
        movaps  xmm0, xmm1
        ret

dotd:                                   # @dotd
        push    rbp
        mov     rbp, rsp
        and     rsp, -32
        sub     rsp, 32
        movapd  xmm1, xmmword ptr [rbp + 16]
        movapd  xmm0, xmmword ptr [rbp + 32]
        mulpd   xmm1, xmmword ptr [rbp + 48]
        mulpd   xmm0, xmmword ptr [rbp + 64]
        addpd   xmm0, xmm1
        shufpd  xmm0, xmm0, 1           # xmm0 = xmm0[1,0]
        haddpd  xmm0, xmm0
        mov     rsp, rbp
        pop     rbp
        ret

gcc gives more efficient code using the same flags:

dotf:
        mulps   xmm0, xmm1
        haddps  xmm0, xmm0
        haddps  xmm0, xmm0
        ret
dotd:
        movapd  xmm0, XMMWORD PTR [rsp+8]
        movapd  xmm1, XMMWORD PTR [rsp+56]
        mulpd   xmm0, XMMWORD PTR [rsp+40]
        mulpd   xmm1, XMMWORD PTR [rsp+24]
        addpd   xmm0, xmm1
        haddpd  xmm0, xmm0
        ret

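In particular, in the "double" case clang appears to emit an unnecessary
prologue and epilogue (frame-pointer setup and stack realignment) plus an
extra shuffle (shufpd) before the haddpd. In the "float" case it emits an
extra shuffle (shufps) and some more work (movaps/movhlps/addps and a final
movaps), where gcc needs only two haddps instructions after the multiply.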

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170307/619ecf15/attachment.html>