<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Squaring a complex float gives inefficient code"

   href="https://llvm.org/bugs/show_bug.cgi?id=31866">31866</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Squaring a complex float gives inefficient code

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>drraph@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Consider:

#include &lt;complex.h&gt;

complex float f(complex float x) {

  return x*x;

}

clang trunk with -O3  -march=core-avx2 gives

f:                                      # @f

        vmovaps xmm2, xmm0

        vmovshdup       xmm1, xmm2      # xmm1 = xmm2[1,1,3,3]

        vmulss  xmm0, xmm2, xmm2

        vmulss  xmm3, xmm1, xmm1

        vmulss  xmm4, xmm2, xmm1

        vsubss  xmm0, xmm0, xmm3

        vaddss  xmm3, xmm4, xmm4

        vucomiss        xmm0, xmm0

        jnp     .LBB0_3

        vucomiss        xmm3, xmm3

        jp      .LBB0_2

.LBB0_3:

        vinsertps       xmm0, xmm0, xmm3, 16 # xmm0 = xmm0[0],xmm3[0],xmm0[2,3]

        ret

.LBB0_2:

        push    rax

        vmovaps xmm0, xmm2

        vmovaps xmm3, xmm1

        call    __mulsc3

        vmovshdup       xmm3, xmm0      # xmm3 = xmm0[1,1,3,3]

        add     rsp, 8

        vinsertps       xmm0, xmm0, xmm3, 16 # xmm0 = xmm0[0],xmm3[0],xmm0[2,3]

        ret

The Intel C compiler with -fp-model strict gives

f:

        vmovsldup xmm1, xmm0                                    #3.12

        vmovshdup xmm2, xmm0                                    #3.12

        vshufps   xmm3, xmm0, xmm0, 177                         #3.12

        vmulps    xmm4, xmm1, xmm0                              #3.12

        vmulps    xmm5, xmm2, xmm3                              #3.12

        vaddsubps xmm0, xmm4, xmm5                              #3.12

        ret 

"strict" should be value safe, turn on floating point exception semantics and

also disable fused multiply-add.  "precise" is the setting to use if you just

want it to be value safe.  -fp-model precise -fp-model except gives

f:

        vmovshdup xmm1, xmm0                                    #3.12

        vshufps   xmm2, xmm0, xmm0, 177                         #3.12

        vmulps    xmm4, xmm1, xmm2                              #3.12

        vmovsldup xmm3, xmm0                                    #3.12

        vfmaddsub213ps xmm0, xmm3, xmm4                         #3.12

        ret  

gcc 7 gives code that is shorter than clang's but still calls __mulsc3.

f:

        vmovq   QWORD PTR [rsp-16], xmm0

        vmovss  xmm3, DWORD PTR [rsp-12]

        vmovss  xmm2, DWORD PTR [rsp-16]

        vmovaps xmm1, xmm3

        vmovaps xmm0, xmm2

        jmp     __mulsc3

If you enable -ffast-math in clang it is much better although not quite optimal

with:

f:                                      # @f

        vmovshdup       xmm1, xmm0      # xmm1 = xmm0[1,1,3,3]

        vaddss  xmm2, xmm0, xmm0

        vmulss  xmm2, xmm1, xmm2

        vmulss  xmm1, xmm1, xmm1

        vfmsub231ss     xmm1, xmm0, xmm0

        vinsertps       xmm0, xmm1, xmm2, 16 # xmm0 = xmm1[0],xmm2[0],xmm1[2,3]

        ret

From my non-expert eyes it seems there are two questions:

1) In the "no fast-math" case is ICC actually meeting the C99 specs?

2) In the "fast-math" case can clang/llvm be persuaded/changed to use one call

to vmulps instead of two calls to vmulss?</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>