<html>
    <head>
      <base href="https://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - Squaring a complex float gives inefficient code"
   href="https://llvm.org/bugs/show_bug.cgi?id=31866">31866</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Squaring a complex float gives inefficient code
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>drraph@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Consider:

#include &lt;complex.h&gt;
complex float f(complex float x) {
  return x*x;
}

clang trunk with -O3  -march=core-avx2 gives

f:                                      # @f
        vmovaps xmm2, xmm0
        vmovshdup       xmm1, xmm2      # xmm1 = xmm2[1,1,3,3]
        vmulss  xmm0, xmm2, xmm2
        vmulss  xmm3, xmm1, xmm1
        vmulss  xmm4, xmm2, xmm1
        vsubss  xmm0, xmm0, xmm3
        vaddss  xmm3, xmm4, xmm4
        vucomiss        xmm0, xmm0
        jnp     .LBB0_3
        vucomiss        xmm3, xmm3
        jp      .LBB0_2
.LBB0_3:
        vinsertps       xmm0, xmm0, xmm3, 16 # xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
        ret
.LBB0_2:
        push    rax
        vmovaps xmm0, xmm2
        vmovaps xmm3, xmm1
        call    __mulsc3
        vmovshdup       xmm3, xmm0      # xmm3 = xmm0[1,1,3,3]
        add     rsp, 8
        vinsertps       xmm0, xmm0, xmm3, 16 # xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
        ret

The Intel C compiler with -fp-model strict gives

f:
        vmovsldup xmm1, xmm0                                    #3.12
        vmovshdup xmm2, xmm0                                    #3.12
        vshufps   xmm3, xmm0, xmm0, 177                         #3.12
        vmulps    xmm4, xmm1, xmm0                              #3.12
        vmulps    xmm5, xmm2, xmm3                              #3.12
        vaddsubps xmm0, xmm4, xmm5                              #3.12
        ret 

"strict" should be value safe, turn on floating-point exception semantics, and
also disable fused multiply-add.  "precise" is the setting to use if you just
want it to be value safe.  -fp-model precise -fp-model except gives

f:
        vmovshdup xmm1, xmm0                                    #3.12
        vshufps   xmm2, xmm0, xmm0, 177                         #3.12
        vmulps    xmm4, xmm1, xmm2                              #3.12
        vmovsldup xmm3, xmm0                                    #3.12
        vfmaddsub213ps xmm0, xmm3, xmm4                         #3.12
        ret  

gcc 7 gives code that is shorter than clang's but still calls __mulsc3.


f:
        vmovq   QWORD PTR [rsp-16], xmm0
        vmovss  xmm3, DWORD PTR [rsp-12]
        vmovss  xmm2, DWORD PTR [rsp-16]
        vmovaps xmm1, xmm3
        vmovaps xmm0, xmm2
        jmp     __mulsc3


If you enable -ffast-math in clang it is much better although not quite optimal
with:

f:                                      # @f
        vmovshdup       xmm1, xmm0      # xmm1 = xmm0[1,1,3,3]
        vaddss  xmm2, xmm0, xmm0
        vmulss  xmm2, xmm1, xmm2
        vmulss  xmm1, xmm1, xmm1
        vfmsub231ss     xmm1, xmm0, xmm0
        vinsertps       xmm0, xmm1, xmm2, 16 # xmm0 = xmm1[0],xmm2[0],xmm1[2,3]
        ret

From my non-expert eyes it seems there are two questions:

1) In the "no fast-math" case is ICC actually meeting the C99 specs?
2) In the "fast-math" case can clang/llvm be persuaded/changed to use one call
to vmulps instead of two calls to vmulss?</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>