<!doctype html>
<html lang="en">
  <head>
    <!-- Encoding declaration comes first so it is seen before any text. -->
    <meta charset="utf-8">
    <title>Bug 32164: Inner product of two vectors gives inefficient code</title>
    <!-- Base URL against which relative links in this document resolve. -->
    <base href="https://bugs.llvm.org/">
  </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW - Inner product of two vectors gives inefficient code"

   href="https://bugs.llvm.org/show_bug.cgi?id=32164">32164</a>

          </td>

        </tr>


        <tr>

          <th>Summary</th>

          <td>Inner product of two vectors gives inefficient code

          </td>

        </tr>


        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>


        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>


        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>


        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>


        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>


        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>


        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>


        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>


        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>


        <tr>

          <th>Reporter</th>

          <td>drraph@gmail.com

          </td>

        </tr>


        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Consider:


typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

typedef double  __attribute__( ( vector_size( 32 ) ) ) float64x4_t;


float dotf(float32x4_t x, float32x4_t y) {

  float ret=0;

  for (int i=0;i!=4;++i) ret+=x[i]*y[i];

  return ret;

}


double dotd(float64x4_t x, float64x4_t y) {

  double ret=0;

  for (int i=0;i!=4;++i) ret+=x[i]*y[i];

  return ret;

}


clang trunk with  -Ofast -march=corei7  gives:


dotf:                                   # @dotf

        mulps   xmm0, xmm1

        shufps  xmm0, xmm0, 27          # xmm0 = xmm0[3,2,1,0]

        movaps  xmm1, xmm0

        movhlps xmm1, xmm1              # xmm1 = xmm1[1,1]

        addps   xmm1, xmm0

        haddps  xmm1, xmm1

        movaps  xmm0, xmm1

        ret


dotd:                                   # @dotd

        push    rbp

        mov     rbp, rsp

        and     rsp, -32

        sub     rsp, 32

        movapd  xmm1, xmmword ptr [rbp + 16]

        movapd  xmm0, xmmword ptr [rbp + 32]

        mulpd   xmm1, xmmword ptr [rbp + 48]

        mulpd   xmm0, xmmword ptr [rbp + 64]

        addpd   xmm0, xmm1

        shufpd  xmm0, xmm0, 1           # xmm0 = xmm0[1,0]

        haddpd  xmm0, xmm0

        mov     rsp, rbp

        pop     rbp

        ret


gcc gives more efficient code using the same flags:


dotf:

        mulps   xmm0, xmm1

        haddps  xmm0, xmm0

        haddps  xmm0, xmm0

        ret

dotd:

        movapd  xmm0, XMMWORD PTR [rsp+8]

        movapd  xmm1, XMMWORD PTR [rsp+56]

        mulpd   xmm0, XMMWORD PTR [rsp+40]

        mulpd   xmm1, XMMWORD PTR [rsp+24]

        addpd   xmm0, xmm1

        haddpd  xmm0, xmm0

        ret


In particular, clang appears to have an unnecessary prologue and epilogue and

an extra shuffle in the case of "double" and an extra shuffle and some more

work in the case of "float".</pre>

        </div>

      </p>


      <hr>

      <span>You are receiving this mail because:</span>


      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>