[llvm-bugs] [Bug 28002] New: Performance regression summing small float array
via llvm-bugs
llvm-bugs at lists.llvm.org
Sat Jun 4 10:42:55 PDT 2016
https://llvm.org/bugs/show_bug.cgi?id=28002
Bug ID: 28002
Summary: Performance regression summing small float array
Product: libraries
Version: 3.8
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
Assignee: unassignedbugs at nondot.org
Reporter: yyc1992 at gmail.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
The following loop takes 20-60% (depending on the machine) longer to run when
compiled with clang 3.8+ compared to 3.7.1 (optimization level `-Ofast`,
`-march=core-avx2`) for a cacheline-aligned array of 1024 floats. When the
array is much larger, so that it no longer fits in cache and the loop is
memory-bandwidth limited, there is no performance difference anymore. (FWIW,
somehow both are much faster than GCC 6.1...)
```c
#include <stddef.h>

__attribute__((noinline)) float sum32(float *a, size_t n)
{
    /* a = (float*)__builtin_assume_aligned(a, 64); */
    float s = 0;
    for (size_t i = 0; i < n; i++)
        s += a[i];
    return s;
}
```
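For anyone reproducing the numbers, a minimal timing harness along these lines
(a sketch; the repetition count, initialization, and output format are my own
choices, not taken from the report) matches the setup described above: a
64-byte-aligned array of 1024 floats summed in a tight loop, compiled together
with the function above.
```c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

float sum32(float *a, size_t n);

int main(void)
{
    enum { N = 1024, REPS = 1000000 };
    /* 64-byte (cacheline) aligned buffer, as described in the report.
     * N * sizeof(float) = 4096 is a multiple of 64, as aligned_alloc requires. */
    float *a = aligned_alloc(64, N * sizeof(float));
    for (size_t i = 0; i < N; i++)
        a[i] = 1.0f;
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    float s = 0;
    for (int r = 0; r < REPS; r++)
        s += sum32(a, N);  /* keep the result live so the call isn't elided */
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (double)(t1.tv_nsec - t0.tv_nsec);
    printf("%.2f ns/call (sum = %f)\n", ns / REPS, s);
    free(a);
    return 0;
}
```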
The C code, LLVM IR, and assembly output for 3.7 and 3.8 are available in [this
gist](https://gist.github.com/yuyichao/5b07f71c1f19248ec5511d758532a4b0). The
difference in assembly is also pasted below.
```diff
--- llvm37.s 2016-06-04 13:23:34.947819989 -0400
+++ llvm38.s 2016-06-04 13:14:25.455283889 -0400
@@ -4,103 +4,95 @@
vxorps %xmm0, %xmm0, %xmm0
testq %rsi, %rsi
je .LBB1_13
-# BB#1: # %overflow.checked
+# BB#1: # %.lr.ph.preheader
+ vxorps %xmm0, %xmm0, %xmm0
+ xorl %ecx, %ecx
+ cmpq $15, %rsi
+ jbe .LBB1_2
+# BB#4: # %min.iters.checked
xorl %ecx, %ecx
movq %rsi, %rax
- vxorps %ymm0, %ymm0, %ymm0
- vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
- andq $-32, %rax
- je .LBB1_10
-# BB#2: # %vector.body.preheader
- leaq -32(%rsi), %r8
+ andq $-16, %rax
+ je .LBB1_2
+# BB#5: # %vector.body.preheader
+ leaq -16(%rsi), %r8
movl %r8d, %ecx
- shrl $5, %ecx
+ shrl $4, %ecx
addl $1, %ecx
xorl %edx, %edx
- testb $3, %cl
- je .LBB1_3
-# BB#4: # %vector.body.prol.preheader
- leal -32(%rsi), %ecx
- shrl $5, %ecx
+ testb $7, %cl
+ je .LBB1_6
+# BB#7: # %vector.body.prol.preheader
+ leal -16(%rsi), %ecx
+ shrl $4, %ecx
addl $1, %ecx
- andl $3, %ecx
+ andl $7, %ecx
negq %rcx
vxorps %ymm0, %ymm0, %ymm0
xorl %edx, %edx
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
.align 16, 0x90
-.LBB1_5: # %vector.body.prol
+.LBB1_8: # %vector.body.prol
# =>This Inner Loop Header: Depth=1
vaddps (%rdi,%rdx,4), %ymm0, %ymm0
vaddps 32(%rdi,%rdx,4), %ymm1, %ymm1
- vaddps 64(%rdi,%rdx,4), %ymm2, %ymm2
- vaddps 96(%rdi,%rdx,4), %ymm3, %ymm3
- addq $32, %rdx
+ addq $16, %rdx
addq $1, %rcx
- jne .LBB1_5
- jmp .LBB1_6
-.LBB1_3:
+ jne .LBB1_8
+ jmp .LBB1_9
+.LBB1_6:
vxorps %ymm0, %ymm0, %ymm0
vxorps %ymm1, %ymm1, %ymm1
- vxorps %ymm2, %ymm2, %ymm2
- vxorps %ymm3, %ymm3, %ymm3
-.LBB1_6: # %vector.body.preheader.split
- cmpq $96, %r8
- jb .LBB1_9
-# BB#7: # %vector.body.preheader.split.split
+.LBB1_9: # %vector.body.preheader.split
+ cmpq $112, %r8
+ jb .LBB1_12
+# BB#10: # %vector.body.preheader.split.split
movq %rsi, %rcx
- andq $-32, %rcx
+ andq $-16, %rcx
subq %rdx, %rcx
leaq 480(%rdi,%rdx,4), %rdx
.align 16, 0x90
-.LBB1_8: # %vector.body
+.LBB1_11: # %vector.body
# =>This Inner Loop Header: Depth=1
vaddps -480(%rdx), %ymm0, %ymm0
vaddps -448(%rdx), %ymm1, %ymm1
- vaddps -416(%rdx), %ymm2, %ymm2
- vaddps -384(%rdx), %ymm3, %ymm3
+ vaddps -416(%rdx), %ymm0, %ymm0
+ vaddps -384(%rdx), %ymm1, %ymm1
vaddps -352(%rdx), %ymm0, %ymm0
vaddps -320(%rdx), %ymm1, %ymm1
- vaddps -288(%rdx), %ymm2, %ymm2
- vaddps -256(%rdx), %ymm3, %ymm3
+ vaddps -288(%rdx), %ymm0, %ymm0
+ vaddps -256(%rdx), %ymm1, %ymm1
vaddps -224(%rdx), %ymm0, %ymm0
vaddps -192(%rdx), %ymm1, %ymm1
- vaddps -160(%rdx), %ymm2, %ymm2
- vaddps -128(%rdx), %ymm3, %ymm3
+ vaddps -160(%rdx), %ymm0, %ymm0
+ vaddps -128(%rdx), %ymm1, %ymm1
vaddps -96(%rdx), %ymm0, %ymm0
vaddps -64(%rdx), %ymm1, %ymm1
- vaddps -32(%rdx), %ymm2, %ymm2
- vaddps (%rdx), %ymm3, %ymm3
+ vaddps -32(%rdx), %ymm0, %ymm0
+ vaddps (%rdx), %ymm1, %ymm1
addq $512, %rdx # imm = 0x200
addq $-128, %rcx
- jne .LBB1_8
-.LBB1_9:
- movq %rax, %rcx
-.LBB1_10: # %middle.block
+ jne .LBB1_11
+.LBB1_12: # %middle.block
vaddps %ymm0, %ymm1, %ymm0
- vaddps %ymm0, %ymm2, %ymm0
- vaddps %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddps %ymm1, %ymm0, %ymm0
- vpermilpd $1, %ymm0, %ymm1 # ymm1 = ymm0[1,0,2,2]
+ vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddps %ymm1, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm0
- cmpq %rsi, %rcx
+ movq %rax, %rcx
+ cmpq %rsi, %rax
je .LBB1_13
-# BB#11: # %.lr.ph.preheader
+.LBB1_2: # %.lr.ph.preheader13
leaq (%rdi,%rcx,4), %rax
subq %rcx, %rsi
.align 16, 0x90
-.LBB1_12: # %.lr.ph
+.LBB1_3: # %.lr.ph
# =>This Inner Loop Header: Depth=1
vaddss (%rax), %xmm0, %xmm0
addq $4, %rax
addq $-1, %rsi
- jne .LBB1_12
+ jne .LBB1_3
.LBB1_13: # %._crit_edge
#APP
#NO_APP
```
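The structural change is visible in the vector body: 3.7 accumulates into four
independent registers (ymm0-ymm3), while 3.8's unrolled body reuses only two
(ymm0/ymm1), so each vaddps depends on a result just two instructions back. My
reading (an inference from the diff, not stated elsewhere in the report) is
that two accumulator chains cannot hide the ~3-cycle vaddps latency when the
data is in cache, while four can; once the loop is memory-bound the latency is
hidden either way, matching the observation above. A scalar sketch of the two
reduction shapes (illustration only; the compiler does this on <8 x float>
vectors, and the function names are mine):
```c
#include <stddef.h>

/* ~3.7 shape: four independent dependency chains.
 * Remainder handling omitted for brevity. */
float sum_4acc(const float *a, size_t n)
{
    float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for (size_t i = 0; i + 4 <= n; i += 4) {
        s0 += a[i];
        s1 += a[i + 1];
        s2 += a[i + 2];
        s3 += a[i + 3];
    }
    return (s0 + s1) + (s2 + s3);
}

/* ~3.8 shape: same number of adds, but only two chains. */
float sum_2acc(const float *a, size_t n)
{
    float s0 = 0, s1 = 0;
    for (size_t i = 0; i + 4 <= n; i += 4) {
        s0 += a[i];
        s1 += a[i + 1];
        s0 += a[i + 2];  /* reuses s0: serializes with the add two lines up */
        s1 += a[i + 3];  /* reuses s1: same problem */
    }
    return s0 + s1;
}
```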