<html>
    <head>
      <base href="https://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - Single complex addition in a loop with one iteration gives inefficient code"
   href="https://llvm.org/bugs/show_bug.cgi?id=31857">31857</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>Single complex addition in a loop with one iteration gives inefficient code
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>new-bugs
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>Linux
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>new bugs
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>drraph@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvm-bugs@lists.llvm.org
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Consider

#include &lt;complex.h&gt;
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i &lt; 1; i++)
    p += 2*x[i];
  return p;
}

This code is simply doubling one complex float and adding 1. 

In clang trunk with -O3  -march=core-avx2 you get

f:                                      # @f
        vmovss  xmm3, dword ptr [rdi + 4] # xmm3 = mem[0],zero,zero,zero
        vbroadcastss    xmm0, xmm3
        vmulps  xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        vmovss  xmm2, dword ptr [rdi]   # xmm2 = mem[0],zero,zero,zero
        vbroadcastss    xmm1, xmm2
        vmovss  xmm4, dword ptr [rip + .LCPI0_1] # xmm4 = mem[0],zero,zero,zero
        vmulps  xmm1, xmm1, xmm4
        vsubps  xmm4, xmm1, xmm0
        vaddps  xmm1, xmm1, xmm0
        vblendps        xmm0, xmm4, xmm1, 2 # xmm0 = xmm4[0],xmm1[1],xmm4[2,3]
        vucomiss        xmm4, xmm4
        jnp     .LBB0_3
        vmovshdup       xmm1, xmm1      # xmm1 = xmm1[1,1,3,3]
        vucomiss        xmm1, xmm1
        jp      .LBB0_2
.LBB0_3:
        vmovss  xmm1, dword ptr [rip + .LCPI0_2] # xmm1 = mem[0],zero,zero,zero
        vaddps  xmm0, xmm0, xmm1
        ret
.LBB0_2:
        push    rax
        vmovss  xmm0, dword ptr [rip + .LCPI0_1] # xmm0 = mem[0],zero,zero,zero
        vxorps  xmm1, xmm1, xmm1
        call    __mulsc3
        add     rsp, 8
        jmp     .LBB0_3



Using the Intel Compiler with -O3  -march=core-avx2  -fp-model strict you get:

f:
        vmovsd    xmm0, QWORD PTR [rdi]                         #5.12
        vmulps    xmm2, xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.12
        vmovsd    xmm1, QWORD PTR p.152.0.0.1[rip]              #3.19
        vaddps    xmm0, xmm1, xmm2                              #5.5
        ret    

as expected.

The -fp-model strict tells the compiler to strictly adhere to value-safe
optimizations when implementing floating-point calculations and enables
floating-point exception semantics. It also turns off fused multiply-add, which
might not be relevant here.

If you turn on -ffast-math in clang trunk you do get much better although still
not ideal code:

f:                                      # @f
        vmovss  xmm0, dword ptr [rdi]   # xmm0 = mem[0],zero,zero,zero
        vmovss  xmm1, dword ptr [rdi + 4] # xmm1 = mem[0],zero,zero,zero
        vaddss  xmm1, xmm1, xmm1
        vmovss  xmm2, dword ptr [rip + .LCPI0_0] # xmm2 = mem[0],zero,zero,zero
        vfmadd213ss     xmm2, xmm0, dword ptr [rip + .LCPI0_1]
        vinsertps       xmm0, xmm2, xmm1, 16 # xmm0 = xmm2[0],xmm1[0],xmm2[2,3]
        ret</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>