<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Single complex addition in a loop with one iteration gives inefficient code"

   href="https://llvm.org/bugs/show_bug.cgi?id=31857">31857</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Single complex addition in a loop with one iteration gives inefficient code

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>new-bugs

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>new bugs

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>drraph@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Consider

#include &lt;complex.h&gt;

complex float f(complex float x[]) {

  complex float p = 1.0;

  for (int i = 0; i &lt; 1; i++)

    p += 2*x[i];

  return p;

}

This code is simply doubling one complex float and adding 1. 

In clang trunk with -O3  -march=core-avx2 you get

f:                                      # @f

        vmovss  xmm3, dword ptr [rdi + 4] # xmm3 = mem[0],zero,zero,zero

        vbroadcastss    xmm0, xmm3

        vmulps  xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]

        vmovss  xmm2, dword ptr [rdi]   # xmm2 = mem[0],zero,zero,zero

        vbroadcastss    xmm1, xmm2

        vmovss  xmm4, dword ptr [rip + .LCPI0_1] # xmm4 = mem[0],zero,zero,zero

        vmulps  xmm1, xmm1, xmm4

        vsubps  xmm4, xmm1, xmm0

        vaddps  xmm1, xmm1, xmm0

        vblendps        xmm0, xmm4, xmm1, 2 # xmm0 = xmm4[0],xmm1[1],xmm4[2,3]

        vucomiss        xmm4, xmm4

        jnp     .LBB0_3

        vmovshdup       xmm1, xmm1      # xmm1 = xmm1[1,1,3,3]

        vucomiss        xmm1, xmm1

        jp      .LBB0_2

.LBB0_3:

        vmovss  xmm1, dword ptr [rip + .LCPI0_2] # xmm1 = mem[0],zero,zero,zero

        vaddps  xmm0, xmm0, xmm1

        ret

.LBB0_2:

        push    rax

        vmovss  xmm0, dword ptr [rip + .LCPI0_1] # xmm0 = mem[0],zero,zero,zero

        vxorps  xmm1, xmm1, xmm1

        call    __mulsc3

        add     rsp, 8

        jmp     .LBB0_3

Using the Intel Compiler with -O3  -march=core-avx2  -fp-model strict you get:

f:

        vmovsd    xmm0, QWORD PTR [rdi]                         #5.12

        vmulps    xmm2, xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.12

        vmovsd    xmm1, QWORD PTR p.152.0.0.1[rip]              #3.19

        vaddps    xmm0, xmm1, xmm2                              #5.5

        ret    

as expected.

The -fp-model strict tells the compiler to strictly adhere to value-safe

optimizations when implementing floating-point calculations and enables

floating-point exception semantics. It also turns off fused multiply-add, which

might not be relevant here.

If you turn on -ffast-math in clang trunk you do get much better although still

not ideal code:

f:                                      # @f

        vmovss  xmm0, dword ptr [rdi]   # xmm0 = mem[0],zero,zero,zero

        vmovss  xmm1, dword ptr [rdi + 4] # xmm1 = mem[0],zero,zero,zero

        vaddss  xmm1, xmm1, xmm1

        vmovss  xmm2, dword ptr [rip + .LCPI0_0] # xmm2 = mem[0],zero,zero,zero

        vfmadd213ss     xmm2, xmm0, dword ptr [rip + .LCPI0_1]

        vinsertps       xmm0, xmm2, xmm1, 16 # xmm0 = xmm2[0],xmm1[0],xmm2[2,3]

        ret</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>