<html lang="en">
    <head>
      <base href="https://bugs.llvm.org/">
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW - Inner product of two vectors gives inefficient code"
   href="https://bugs.llvm.org/show_bug.cgi?id=32164">32164</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Inner product of two vectors gives inefficient code
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>enhancement
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>drraph@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr></table>
        <div>
        <pre>Consider:

typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
typedef double  __attribute__( ( vector_size( 32 ) ) ) float64x4_t;

float dotf(float32x4_t x, float32x4_t y) {
  float ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

double dotd(float64x4_t x, float64x4_t y) {
  double ret=0;
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];
  return ret;
}

clang trunk with  -Ofast -march=corei7  gives:

dotf:                                   # @dotf
        mulps   xmm0, xmm1
        shufps  xmm0, xmm0, 27          # xmm0 = xmm0[3,2,1,0]
        movaps  xmm1, xmm0
        movhlps xmm1, xmm1              # xmm1 = xmm1[1,1]
        addps   xmm1, xmm0
        haddps  xmm1, xmm1
        movaps  xmm0, xmm1
        ret

dotd:                                   # @dotd
        push    rbp
        mov     rbp, rsp
        and     rsp, -32
        sub     rsp, 32
        movapd  xmm1, xmmword ptr [rbp + 16]
        movapd  xmm0, xmmword ptr [rbp + 32]
        mulpd   xmm1, xmmword ptr [rbp + 48]
        mulpd   xmm0, xmmword ptr [rbp + 64]
        addpd   xmm0, xmm1
        shufpd  xmm0, xmm0, 1           # xmm0 = xmm0[1,0]
        haddpd  xmm0, xmm0
        mov     rsp, rbp
        pop     rbp
        ret


gcc gives more efficient code using the same flags:

dotf:
        mulps   xmm0, xmm1
        haddps  xmm0, xmm0
        haddps  xmm0, xmm0
        ret
dotd:
        movapd  xmm0, XMMWORD PTR [rsp+8]
        movapd  xmm1, XMMWORD PTR [rsp+56]
        mulpd   xmm0, XMMWORD PTR [rsp+40]
        mulpd   xmm1, XMMWORD PTR [rsp+24]
        addpd   xmm0, xmm1
        haddpd  xmm0, xmm0
        ret

In particular, clang appears to have an unnecessary prologue and epilogue and
an extra shuffle in the case of "double" and an extra shuffle and some more
work in the case of "float".</pre>
        </div>


      <hr>
      <span>You are receiving this mail because:</span>

      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>