<html>

    <head>

      <base href="https://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Performance regression summing small float array"

   href="https://llvm.org/bugs/show_bug.cgi?id=28002">28002</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Performance regression summing small float array

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>3.8

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Linux

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>normal

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Backend: X86

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>yyc1992@gmail.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvm-bugs@lists.llvm.org

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>The following loop takes 20-60% (depending on the machine) longer to run when

compiled with clang 3.8+ compared to 3.7.1 (optimization level `-Ofast`,

`-march=core-avx2`) for a cacheline aligned array of 1024 floats. When the

array is much larger so that it doesn't fit in the cache and the loop is memory

bandwidth limited, there's no performance difference anymore. (FWIW, somehow

both are much faster than GCC 6.1...)

```c

__attribute__((noinline)) float sum32(float *a, size_t n)

{

    /* a = (float*)__builtin_assume_aligned(a, 64); */

    float s = 0;

    for (size_t i = 0;i < n;i++)

        s += a[i];

    return s;

}

```

The C code, LLVM IR and assembly output on 3.7 and 3.8 are available in [this

gist](<a href="https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0">https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0</a>). The

difference in assembly is also pasted below.

```diff

--- llvm37.s    2016-06-04 13:23:34.947819989 -0400

+++ llvm38.s    2016-06-04 13:14:25.455283889 -0400

@@ -4,103 +4,95 @@

         vxorps  %xmm0, %xmm0, %xmm0

         testq   %rsi, %rsi

         je      .LBB1_13

-# BB#1:                                 # %overflow.checked

+# BB#1:                                 # %.lr.ph.preheader

+        vxorps  %xmm0, %xmm0, %xmm0

+        xorl    %ecx, %ecx

+        cmpq    $15, %rsi

+        jbe     .LBB1_2

+# BB#4:                                 # %min.iters.checked

         xorl    %ecx, %ecx

         movq    %rsi, %rax

-        vxorps  %ymm0, %ymm0, %ymm0

-        vxorps  %ymm1, %ymm1, %ymm1

-        vxorps  %ymm2, %ymm2, %ymm2

-        vxorps  %ymm3, %ymm3, %ymm3

-        andq    $-32, %rax

-        je      .LBB1_10

-# BB#2:                                 # %vector.body.preheader

-        leaq    -32(%rsi), %r8

+        andq    $-16, %rax

+        je      .LBB1_2

+# BB#5:                                 # %vector.body.preheader

+        leaq    -16(%rsi), %r8

         movl    %r8d, %ecx

-        shrl    $5, %ecx

+        shrl    $4, %ecx

         addl    $1, %ecx

         xorl    %edx, %edx

-        testb   $3, %cl

-        je      .LBB1_3

-# BB#4:                                 # %vector.body.prol.preheader

-        leal    -32(%rsi), %ecx

-        shrl    $5, %ecx

+        testb   $7, %cl

+        je      .LBB1_6

+# BB#7:                                 # %vector.body.prol.preheader

+        leal    -16(%rsi), %ecx

+        shrl    $4, %ecx

         addl    $1, %ecx

-        andl    $3, %ecx

+        andl    $7, %ecx

         negq    %rcx

         vxorps  %ymm0, %ymm0, %ymm0

         xorl    %edx, %edx

         vxorps  %ymm1, %ymm1, %ymm1

-        vxorps  %ymm2, %ymm2, %ymm2

-        vxorps  %ymm3, %ymm3, %ymm3

         .align  16, 0x90

-.LBB1_5:                                # %vector.body.prol

+.LBB1_8:                                # %vector.body.prol

                                         # =>This Inner Loop Header: Depth=1

         vaddps  (%rdi,%rdx,4), %ymm0, %ymm0

         vaddps  32(%rdi,%rdx,4), %ymm1, %ymm1

-        vaddps  64(%rdi,%rdx,4), %ymm2, %ymm2

-        vaddps  96(%rdi,%rdx,4), %ymm3, %ymm3

-        addq    $32, %rdx

+        addq    $16, %rdx

         addq    $1, %rcx

-        jne     .LBB1_5

-        jmp     .LBB1_6

-.LBB1_3:

+        jne     .LBB1_8

+        jmp     .LBB1_9

+.LBB1_6:

         vxorps  %ymm0, %ymm0, %ymm0

         vxorps  %ymm1, %ymm1, %ymm1

-        vxorps  %ymm2, %ymm2, %ymm2

-        vxorps  %ymm3, %ymm3, %ymm3

-.LBB1_6:                                # %vector.body.preheader.split

-        cmpq    $96, %r8

-        jb      .LBB1_9

-# BB#7:                                 # %vector.body.preheader.split.split

+.LBB1_9:                                # %vector.body.preheader.split

+        cmpq    $112, %r8

+        jb      .LBB1_12

+# BB#10:                                # %vector.body.preheader.split.split

         movq    %rsi, %rcx

-        andq    $-32, %rcx

+        andq    $-16, %rcx

         subq    %rdx, %rcx

         leaq    480(%rdi,%rdx,4), %rdx

         .align  16, 0x90

-.LBB1_8:                                # %vector.body

+.LBB1_11:                               # %vector.body

                                         # =>This Inner Loop Header: Depth=1

         vaddps  -480(%rdx), %ymm0, %ymm0

         vaddps  -448(%rdx), %ymm1, %ymm1

-        vaddps  -416(%rdx), %ymm2, %ymm2

-        vaddps  -384(%rdx), %ymm3, %ymm3

+        vaddps  -416(%rdx), %ymm0, %ymm0

+        vaddps  -384(%rdx), %ymm1, %ymm1

         vaddps  -352(%rdx), %ymm0, %ymm0

         vaddps  -320(%rdx), %ymm1, %ymm1

-        vaddps  -288(%rdx), %ymm2, %ymm2

-        vaddps  -256(%rdx), %ymm3, %ymm3

+        vaddps  -288(%rdx), %ymm0, %ymm0

+        vaddps  -256(%rdx), %ymm1, %ymm1

         vaddps  -224(%rdx), %ymm0, %ymm0

         vaddps  -192(%rdx), %ymm1, %ymm1

-        vaddps  -160(%rdx), %ymm2, %ymm2

-        vaddps  -128(%rdx), %ymm3, %ymm3

+        vaddps  -160(%rdx), %ymm0, %ymm0

+        vaddps  -128(%rdx), %ymm1, %ymm1

         vaddps  -96(%rdx), %ymm0, %ymm0

         vaddps  -64(%rdx), %ymm1, %ymm1

-        vaddps  -32(%rdx), %ymm2, %ymm2

-        vaddps  (%rdx), %ymm3, %ymm3

+        vaddps  -32(%rdx), %ymm0, %ymm0

+        vaddps  (%rdx), %ymm1, %ymm1

         addq    $512, %rdx              # imm = 0x200

         addq    $-128, %rcx

-        jne     .LBB1_8

-.LBB1_9:

-        movq    %rax, %rcx

-.LBB1_10:                               # %middle.block

+        jne     .LBB1_11

+.LBB1_12:                               # %middle.block

         vaddps  %ymm0, %ymm1, %ymm0

-        vaddps  %ymm0, %ymm2, %ymm0

-        vaddps  %ymm0, %ymm3, %ymm0

         vextractf128    $1, %ymm0, %xmm1

         vaddps  %ymm1, %ymm0, %ymm0

-        vpermilpd       $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]

+        vpermilpd       $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]

         vaddps  %ymm1, %ymm0, %ymm0

         vhaddps %ymm0, %ymm0, %ymm0

-        cmpq    %rsi, %rcx

+        movq    %rax, %rcx

+        cmpq    %rsi, %rax

         je      .LBB1_13

-# BB#11:                                # %.lr.ph.preheader

+.LBB1_2:                                # %.lr.ph.preheader13

         leaq    (%rdi,%rcx,4), %rax

         subq    %rcx, %rsi

         .align  16, 0x90

-.LBB1_12:                               # %.lr.ph

+.LBB1_3:                                # %.lr.ph

                                         # =>This Inner Loop Header: Depth=1

         vaddss  (%rax), %xmm0, %xmm0

         addq    $4, %rax

         addq    $-1, %rsi

-        jne     .LBB1_12

+        jne     .LBB1_3

 .LBB1_13:                               # %._crit_edge

         #APP

         #NO_APP

```</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>