<html>
<head>
<base href="https://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - Performance regression summing small float array"
href="https://llvm.org/bugs/show_bug.cgi?id=28002">28002</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Performance regression summing small float array
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>3.8
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: X86
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>yyc1992@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<p>
<div>
<pre>The following loop takes 20-60% (depending on the machine) longer to run when
compiled with clang 3.8+ compared to 3.7.1 (optimization level `-Ofast`,
`-march=core-avx2`) for a cacheline aligned array of 1024 floats. When the
array is much larger so that it doesn't fit in the cache and the loop is memory
bandwidth limited, there's no performance difference anymore. (FWIW, somehow
both are much faster than GCC 6.1...)
```c
__attribute__((noinline)) float sum32(float *a, size_t n)
{
/* a = (float*)__builtin_assume_aligned(a, 64); */
float s = 0;
for (size_t i = 0;i < n;i++)
s += a[i];
return s;
}
```
The C code, LLVM IR and assembly output on 3.7 and 3.8 are available in [this
gist](<a href="https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0">https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0</a>). The
difference in assembly is also pasted below.
```diff
--- llvm37.s 2016-06-04 13:23:34.947819989 -0400
+++ llvm38.s 2016-06-04 13:14:25.455283889 -0400
@@ -4,103 +4,95 @@
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
-# BB#1: # %overflow.checked
+# BB#1: # %.lr.ph.preheader
+ vxorps %xmm0, %xmm0, %xmm0
+ xorl %ecx, %ecx
+ cmpq $15, %rsi
+ jbe .LBB1_2
+# BB#4: # %min.iters.checked
xorl %ecx, %ecx
movq %rsi, %rax
- vxorps %ymm0, %ymm0, %ymm0
- vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
- andq $-32, %rax
- je .LBB1_10
-# BB#2: # %vector.body.preheader
- leaq -32(%rsi), %r8
+ andq $-16, %rax
+ je .LBB1_2
+# BB#5: # %vector.body.preheader
+ leaq -16(%rsi), %r8
movl %r8d, %ecx
- shrl $5, %ecx
+ shrl $4, %ecx
addl $1, %ecx
xorl %edx, %edx
- testb $3, %cl
- je .LBB1_3
-# BB#4: # %vector.body.prol.preheader
- leal -32(%rsi), %ecx
- shrl $5, %ecx
+ testb $7, %cl
+ je .LBB1_6
+# BB#7: # %vector.body.prol.preheader
+ leal -16(%rsi), %ecx
+ shrl $4, %ecx
addl $1, %ecx
- andl $3, %ecx
+ andl $7, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
.align 16, 0x90
-.LBB1_5: # %vector.body.prol
+.LBB1_8: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
- vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2
- vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3
- addq $32, %rdx
+ addq $16, %rdx
addq $1, %rcx
- jne .LBB1_5
- jmp .LBB1_6
-.LBB1_3:
+ jne .LBB1_8
+ jmp .LBB1_9
+.LBB1_6:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
-.LBB1_6: # %vector.body.preheader.split
- cmpq $96, %r8
- jb .LBB1_9
-# BB#7: # %vector.body.preheader.split.split
+.LBB1_9: # %vector.body.preheader.split
+ cmpq $112, %r8
+ jb .LBB1_12
+# BB#10: # %vector.body.preheader.split.split
movq %rsi, %rcx
- andq $-32, %rcx
+ andq $-16, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
-.LBB1_8: # %vector.body
+.LBB1_11: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
- vaddps -416(%rdx), %ymm2, %ymm2
- vaddps -384(%rdx), %ymm3, %ymm3
+ vaddps -416(%rdx), %ymm0, %ymm0
+ vaddps -384(%rdx), %ymm1, %ymm1
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
- vaddps -288(%rdx), %ymm2, %ymm2
- vaddps -256(%rdx), %ymm3, %ymm3
+ vaddps -288(%rdx), %ymm0, %ymm0
+ vaddps -256(%rdx), %ymm1, %ymm1
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
- vaddps -160(%rdx), %ymm2, %ymm2
- vaddps -128(%rdx), %ymm3, %ymm3
+ vaddps -160(%rdx), %ymm0, %ymm0
+ vaddps -128(%rdx), %ymm1, %ymm1
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
- vaddps -32(%rdx), %ymm2, %ymm2
- vaddps (%rdx), %ymm3, %ymm3
+ vaddps -32(%rdx), %ymm0, %ymm0
+ vaddps (%rdx), %ymm1, %ymm1
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
- jne .LBB1_8
-.LBB1_9:
- movq %rax, %rcx
-.LBB1_10: # %middle.block
+ jne .LBB1_11
+.LBB1_12: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
- vaddps %ymm0, %ymm2, %ymm0
- vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
- vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
+ vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
- cmpq %rsi, %rcx
+ movq %rax, %rcx
+ cmpq %rsi, %rax
je .LBB1_13
-# BB#11: # %.lr.ph.preheader
+.LBB1_2: # %.lr.ph.preheader13
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
-.LBB1_12: # %.lr.ph
+.LBB1_3: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
- jne .LBB1_12
+ jne .LBB1_3
.LBB1_13: # %._crit_edge
#APP
#NO_APP
```</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>