<!doctype html>
<html lang="en">

    <head>

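      <meta charset="utf-8">

      <title>Bug 31690 - Simple addition reduction not vectorised when limit is 48 or less.</title>

      <base href="https://llvm.org/bugs/" />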

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Simple addition reduction not vectorised when limit is 48 or less."

   href="https://llvm.org/bugs/show_bug.cgi?id=31690">31690</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Simple addition reduction not vectorised when limit is 48 or less.

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Loop Optimizer

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>drraph@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>Consider this simple loop

float f(float x[]) {

  float p = 1.0;

  for (int i = 0; i &lt; 48; i++)

    p += x[i];

  return p;

}

clang 3.9.1 gives

.LCPI0_0:

        .long   1065353216              # float 1

f:                                      # @f

        vmovss  xmm0, dword ptr [rdi]   # xmm0 = mem[0],zero,zero,zero

        vaddss  xmm0, xmm0, dword ptr [rip + .LCPI0_0]

        vaddss  xmm0, xmm0, dword ptr [rdi + 4]

        vaddss  xmm0, xmm0, dword ptr [rdi + 8]

        vaddss  xmm0, xmm0, dword ptr [rdi + 12]

        vaddss  xmm0, xmm0, dword ptr [rdi + 16]

        vaddss  xmm0, xmm0, dword ptr [rdi + 20]

        vaddss  xmm0, xmm0, dword ptr [rdi + 24]

        vaddss  xmm0, xmm0, dword ptr [rdi + 28]

        vaddss  xmm0, xmm0, dword ptr [rdi + 32]

        vaddss  xmm0, xmm0, dword ptr [rdi + 36]

        vaddss  xmm0, xmm0, dword ptr [rdi + 40]

        vaddss  xmm0, xmm0, dword ptr [rdi + 44]

        vaddss  xmm0, xmm0, dword ptr [rdi + 48]

        vaddss  xmm0, xmm0, dword ptr [rdi + 52]

        vaddss  xmm0, xmm0, dword ptr [rdi + 56]

        vaddss  xmm0, xmm0, dword ptr [rdi + 60]

        vaddss  xmm0, xmm0, dword ptr [rdi + 64]

        vaddss  xmm0, xmm0, dword ptr [rdi + 68]

        vaddss  xmm0, xmm0, dword ptr [rdi + 72]

        vaddss  xmm0, xmm0, dword ptr [rdi + 76]

        vaddss  xmm0, xmm0, dword ptr [rdi + 80]

        vaddss  xmm0, xmm0, dword ptr [rdi + 84]

        vaddss  xmm0, xmm0, dword ptr [rdi + 88]

        vaddss  xmm0, xmm0, dword ptr [rdi + 92]

        vaddss  xmm0, xmm0, dword ptr [rdi + 96]

        vaddss  xmm0, xmm0, dword ptr [rdi + 100]

        vaddss  xmm0, xmm0, dword ptr [rdi + 104]

        vaddss  xmm0, xmm0, dword ptr [rdi + 108]

        vaddss  xmm0, xmm0, dword ptr [rdi + 112]

        vaddss  xmm0, xmm0, dword ptr [rdi + 116]

        vaddss  xmm0, xmm0, dword ptr [rdi + 120]

        vaddss  xmm0, xmm0, dword ptr [rdi + 124]

        vaddss  xmm0, xmm0, dword ptr [rdi + 128]

        vaddss  xmm0, xmm0, dword ptr [rdi + 132]

        vaddss  xmm0, xmm0, dword ptr [rdi + 136]

        vaddss  xmm0, xmm0, dword ptr [rdi + 140]

        vaddss  xmm0, xmm0, dword ptr [rdi + 144]

        vaddss  xmm0, xmm0, dword ptr [rdi + 148]

        vaddss  xmm0, xmm0, dword ptr [rdi + 152]

        vaddss  xmm0, xmm0, dword ptr [rdi + 156]

        vaddss  xmm0, xmm0, dword ptr [rdi + 160]

        vaddss  xmm0, xmm0, dword ptr [rdi + 164]

        vaddss  xmm0, xmm0, dword ptr [rdi + 168]

        vaddss  xmm0, xmm0, dword ptr [rdi + 172]

        vaddss  xmm0, xmm0, dword ptr [rdi + 176]

        vaddss  xmm0, xmm0, dword ptr [rdi + 180]

        vaddss  xmm0, xmm0, dword ptr [rdi + 184]

        vaddss  xmm0, xmm0, dword ptr [rdi + 188]

        ret

This is fully unrolled but not vectorized.

However, using icc we get:

f:

        movups    xmm13, XMMWORD PTR [rdi]                      #4.10

        movups    xmm0, XMMWORD PTR [16+rdi]                    #4.10

        movups    xmm6, XMMWORD PTR [32+rdi]                    #4.10

        movups    xmm1, XMMWORD PTR [48+rdi]                    #4.10

        movups    xmm9, XMMWORD PTR [64+rdi]                    #4.10

        movups    xmm2, XMMWORD PTR [80+rdi]                    #4.10

        movups    xmm7, XMMWORD PTR [96+rdi]                    #4.10

        movups    xmm3, XMMWORD PTR [112+rdi]                   #4.10

        movups    xmm10, XMMWORD PTR [128+rdi]                  #4.10

        movups    xmm4, XMMWORD PTR [144+rdi]                   #4.10

        movups    xmm8, XMMWORD PTR [160+rdi]                   #4.10

        movups    xmm5, XMMWORD PTR [176+rdi]                   #4.10

        addps     xmm13, xmm0                                   #2.11

        addps     xmm6, xmm1                                    #2.11

        addps     xmm9, xmm2                                    #2.11

        addps     xmm7, xmm3                                    #2.11

        addps     xmm10, xmm4                                   #2.11

        addps     xmm8, xmm5                                    #2.11

        addps     xmm13, xmm6                                   #2.11

        addps     xmm9, xmm7                                    #2.11

        addps     xmm10, xmm8                                   #2.11

        addps     xmm13, xmm9                                   #2.11

        addps     xmm13, xmm10                                  #2.11

        movaps    xmm11, xmm13                                  #2.11

        movhlps   xmm11, xmm13                                  #2.11

        addps     xmm13, xmm11                                  #2.11

        movaps    xmm12, xmm13                                  #2.11

        shufps    xmm12, xmm13, 245                             #2.11

        addss     xmm13, xmm12                                  #2.11

        addss     xmm13, DWORD PTR .L_2il0floatpacket.0[rip]    #2.11

        movaps    xmm0, xmm13                                   #5.10

        ret                                                     #5.10

.L_2il0floatpacket.0:

        .long   0x3f800000

This is both vectorised and unrolled.

If you increase the loop limit from 48 to 64, say, then clang/llvm stops

unrolling and does manage to vectorise the code.</pre>

        </div>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>