<html>
    <head>
      <base href="https://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - Complex division is not optimised with -ffast-math"
   href="https://llvm.org/bugs/show_bug.cgi?id=31872">31872</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Complex division is not optimised with -ffast-math
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>drraph@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Consider:

#include &lt;complex.h&gt;
complex float f(complex float x, complex float y) {
  return x/y;
}

clang trunk with -O3 -march=core-avx2 but with or without -ffast-math gives:

f:                                      # @f
        vmovaps xmm2, xmm1
        vmovshdup       xmm1, xmm0      # xmm1 = xmm0[1,1,3,3]
        vmovshdup       xmm3, xmm2      # xmm3 = xmm2[1,1,3,3]
        jmp     __divsc3                # TAILCALL


However both gcc and ICC attempt to optimise this code when -ffast-math (or
equivalent) is enabled.

ICC appears to give the fastest code which is:

f:
        vcvtps2pd xmm2, xmm1                                    #3.12
        vcvtps2pd xmm4, xmm0                                    #3.12
        vmulpd    xmm8, xmm2, xmm2                              #3.12
        vunpckhpd xmm3, xmm2, xmm2                              #3.12
        vmulpd    xmm6, xmm3, xmm4                              #3.12
        vmovddup  xmm7, xmm2                                    #3.12
        vshufpd   xmm5, xmm4, xmm4, 1                           #3.12
        vshufpd   xmm9, xmm8, xmm8, 1                           #3.12
        vfmaddsub213pd xmm7, xmm5, xmm6                         #3.12
        vaddpd    xmm11, xmm8, xmm9                             #3.12
        vshufpd   xmm10, xmm7, xmm7, 1                          #3.12
        vdivpd    xmm12, xmm10, xmm11                           #3.12
        vcvtpd2ps xmm0, xmm12                                   #3.12
        ret</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>