<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Complex division is not optimised with -ffast-math"

   href="https://llvm.org/bugs/show_bug.cgi?id=31872">31872</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Complex division is not optimised with -ffast-math

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>drraph@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Consider:

#include &lt;complex.h&gt;

complex float f(complex float x, complex float y) {

  return x/y;

}

clang trunk with -O3 -march=core-avx2 but with or without -ffast-math gives:

f:                                      # @f

        vmovaps xmm2, xmm1

        vmovshdup       xmm1, xmm0      # xmm1 = xmm0[1,1,3,3]

        vmovshdup       xmm3, xmm2      # xmm3 = xmm2[1,1,3,3]

        jmp     __divsc3                # TAILCALL

However both gcc and ICC attempt to optimise this code when -ffast-math (or

equivalent) is enabled.

ICC appears to give the fastest code which is:

f:

        vcvtps2pd xmm2, xmm1                                    #3.12

        vcvtps2pd xmm4, xmm0                                    #3.12

        vmulpd    xmm8, xmm2, xmm2                              #3.12

        vunpckhpd xmm3, xmm2, xmm2                              #3.12

        vmulpd    xmm6, xmm3, xmm4                              #3.12

        vmovddup  xmm7, xmm2                                    #3.12

        vshufpd   xmm5, xmm4, xmm4, 1                           #3.12

        vshufpd   xmm9, xmm8, xmm8, 1                           #3.12

        vfmaddsub213pd xmm7, xmm5, xmm6                         #3.12

        vaddpd    xmm11, xmm8, xmm9                             #3.12

        vshufpd   xmm10, xmm7, xmm7, 1                          #3.12

        vdivpd    xmm12, xmm10, xmm11                           #3.12

        vcvtpd2ps xmm0, xmm12                                   #3.12

        ret</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>