<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - clang/llvm vectorize the sum of a complex array poorly"

   href="https://llvm.org/bugs/show_bug.cgi?id=31800">31800</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>clang/llvm vectorize the sum of a complex array poorly

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>3.9

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Loop Optimizer

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>drraph@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Consider this code:

#include &lt;complex.h&gt;

complex float f(complex float x[]) {

  complex float p = 1.0;

  for (int i = 0; i &lt; 32; i++)

    p += x[i];

  return p;

}

clang 3.9.1 with -O3 -march=core-avx2 -ffast-math  gives

f:                                      # @f

        vmovq   xmm0, qword ptr [rdi]   # xmm0 = mem[0],zero

        vmovss  xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 8] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 16] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 24] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 32] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 40] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 48] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 56] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 64] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 72] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 80] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 88] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 96] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 104] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 112] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 120] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 128] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 136] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 144] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 152] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 160] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 168] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 176] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 184] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 192] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 200] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 208] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 216] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        vmovq   xmm1, qword ptr [rdi + 224] # xmm1 = mem[0],zero

        vmovq   xmm2, qword ptr [rdi + 232] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 240] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vmovq   xmm2, qword ptr [rdi + 248] # xmm2 = mem[0],zero

        vaddps  xmm1, xmm1, xmm2

        vaddps  xmm0, xmm0, xmm1

        ret

The only vectorization is that the real and the imaginary parts are added in 

parallel.  The assembly also wastes half of the xmm register.

However in icc you get:

f:

        vmovups   ymm1, YMMWORD PTR [rdi]                       #5.10

        vmovups   ymm2, YMMWORD PTR [64+rdi]                    #5.10

        vmovups   ymm5, YMMWORD PTR [128+rdi]                   #5.10

        vmovups   ymm6, YMMWORD PTR [192+rdi]                   #5.10

        vmovsd    xmm0, QWORD PTR p.152.0.0.1[rip]              #3.19

        vaddps    ymm3, ymm1, YMMWORD PTR [32+rdi]              #3.19

        vaddps    ymm4, ymm2, YMMWORD PTR [96+rdi]              #3.19

        vaddps    ymm7, ymm5, YMMWORD PTR [160+rdi]             #3.19

        vaddps    ymm8, ymm6, YMMWORD PTR [224+rdi]             #3.19

        vaddps    ymm9, ymm3, ymm4                              #3.19

        vaddps    ymm10, ymm7, ymm8                             #3.19

        vaddps    ymm11, ymm9, ymm10                            #3.19

        vextractf128 xmm12, ymm11, 1                            #3.19

        vaddps    xmm13, xmm11, xmm12                           #3.19

        vmovhlps  xmm14, xmm13, xmm13                           #3.19

        vaddps    xmm15, xmm13, xmm14                           #3.19

        vaddps    xmm0, xmm15, xmm0                             #3.19

        vzeroupper                                              #6.10

        ret     

which is fully vectorized (and uses the wider ymm registers).

Another key difference seems to be that in the clang/llvm produced assembly

subsequent additions depend on each other.  Whereas in the icc code the

additions work on subsequent items and so it benefits both from full

vectorization and superscalar parallelism.

(This report is related to <a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - clang/llvm fails to vectorize the product of a complex array"

   href="show_bug.cgi?id=31677">https://llvm.org/bugs/show_bug.cgi?id=31677</a> where I

incorrectly stated at the end of the problem report that llvm could vectorise

this additive reduction loop.)</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>