[llvm-bugs] [Bug 32132] New: The number of instructions inserted before/after a simple loop outweights the loop body
via llvm-bugs
llvm-bugs at lists.llvm.org
Fri Mar 3 10:58:16 PST 2017
https://bugs.llvm.org/show_bug.cgi?id=32132
Bug ID: 32132
Summary: The number of instructions inserted before/after a
simple loop outweights the loop body
Product: new-bugs
Version: 3.9
Hardware: PC
OS: All
Status: NEW
Severity: enhancement
Priority: P
Component: new bugs
Assignee: unassignedbugs at nondot.org
Reporter: kobalicek.petr at gmail.com
CC: llvm-bugs at lists.llvm.org
Clang tries hard to optimize loops, but in some cases the number of
instructions inserted before/after a loop outweighs the loop body. The sample
code for quick play and comparison is available here:
https://godbolt.org/g/utnuxK
Sample Code
-----------
#include <stdint.h>
#if defined(_MSC_VER)
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
// A function that processes points (x|y), passed as doubles.
void transform(double* dst, const double* src, const double* matrix, size_t
length) {
intptr_t i = static_cast<intptr_t>(length);
while ((i -= 8) >= 0) {
__m256d s0 = _mm256_loadu_pd(src + 0);
__m256d s1 = _mm256_loadu_pd(src + 4);
__m256d s2 = _mm256_loadu_pd(src + 8);
__m256d s3 = _mm256_loadu_pd(src + 12);
_mm256_storeu_pd(dst + 0, _mm256_add_pd(s0, s0));
_mm256_storeu_pd(dst + 4, _mm256_add_pd(s1, s1));
_mm256_storeu_pd(dst + 8, _mm256_add_pd(s2, s2));
_mm256_storeu_pd(dst + 12, _mm256_add_pd(s3, s3));
dst += 16;
src += 16;
}
i += 8;
while ((i -= 2) >= 0) {
__m256d s0 = _mm256_loadu_pd(src);
_mm256_storeu_pd(dst, _mm256_add_pd(s0, s0));
dst += 4;
src += 4;
}
if (i & 1) {
__m128d s0 = _mm_loadu_pd(src);
_mm_storeu_pd(dst, _mm_add_pd(s0, s0));
}
}
Compiled by clang (-O2 -mavx -fno-exceptions):
-----------------
transform(double*, double const*, double const*, unsigned long):
; --- Ridiculous code ---
mov r10, rcx
add r10, -8
js .LBB0_7
mov r11, r10
shr r11, 3
mov r8, r11
shl r8, 4
lea r9d, [r11 + 1]
and r9d, 1
mov rdx, rdi
mov rax, rsi
test r11, r11
je .LBB0_4
lea rcx, [r9 - 1]
sub rcx, r11
mov rdx, rdi
mov rax, rsi
; -----------------------
.LBB0_3: # =>This Inner Loop Header: Depth=1
vmovupd ymm0, ymmword ptr [rax]
vmovupd ymm1, ymmword ptr [rax + 32]
vmovupd ymm2, ymmword ptr [rax + 64]
vmovupd ymm3, ymmword ptr [rax + 96]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rdx], ymm0
vaddpd ymm0, ymm1, ymm1
vmovupd ymmword ptr [rdx + 32], ymm0
vaddpd ymm0, ymm2, ymm2
vmovupd ymmword ptr [rdx + 64], ymm0
vaddpd ymm0, ymm3, ymm3
vmovupd ymmword ptr [rdx + 96], ymm0
vmovupd ymm0, ymmword ptr [rax + 128]
vmovupd ymm1, ymmword ptr [rax + 160]
vmovupd ymm2, ymmword ptr [rax + 192]
vmovupd ymm3, ymmword ptr [rax + 224]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rdx + 128], ymm0
vaddpd ymm0, ymm1, ymm1
vmovupd ymmword ptr [rdx + 160], ymm0
vaddpd ymm0, ymm2, ymm2
vmovupd ymmword ptr [rdx + 192], ymm0
vaddpd ymm0, ymm3, ymm3
vmovupd ymmword ptr [rdx + 224], ymm0
add rdx, 256
add rax, 256
; --- Instead of using sub/jns it uses add/jne ---
add rcx, 2
jne .LBB0_3
; ------------------------------------------------
; --- CLANG Unrolled the tail loop! ---
.LBB0_4:
shl r11, 3
lea rcx, [r8 + 16]
test r9, r9
je .LBB0_6
vmovupd ymm0, ymmword ptr [rax]
vmovupd ymm1, ymmword ptr [rax + 32]
vmovupd ymm2, ymmword ptr [rax + 64]
vmovupd ymm3, ymmword ptr [rax + 96]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rdx], ymm0
vaddpd ymm0, ymm1, ymm1
vmovupd ymmword ptr [rdx + 32], ymm0
vaddpd ymm0, ymm2, ymm2
vmovupd ymmword ptr [rdx + 64], ymm0
vaddpd ymm0, ymm3, ymm3
vmovupd ymmword ptr [rdx + 96], ymm0
.LBB0_6:
; --- Ridiculous code ---
lea rdi, [rdi + 8*r8 + 128]
sub r10, r11
lea rsi, [rsi + 8*rcx]
mov rcx, r10
; -----------------------
.LBB0_7:
; --- Ridiculous code ---
mov r10, rcx
add r10, -2
js .LBB0_15
mov r8, r10
shr r8
lea r9d, [r8 + 1]
and r9d, 3
mov rax, rdi
mov rdx, rsi
cmp r10, 6
jb .LBB0_11
lea r10, [r9 - 1]
sub r10, r8
mov rax, rdi
mov rdx, rsi
; -----------------------
.LBB0_10: # =>This Inner Loop Header: Depth=1
vmovupd ymm0, ymmword ptr [rdx]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rax], ymm0
vmovupd ymm0, ymmword ptr [rdx + 32]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rax + 32], ymm0
vmovupd ymm0, ymmword ptr [rdx + 64]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rax + 64], ymm0
vmovupd ymm0, ymmword ptr [rdx + 96]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rax + 96], ymm0
sub rax, -128
sub rdx, -128
add r10, 4
jne .LBB0_10
.LBB0_11:
lea r10, [4*r8 + 4]
lea r11, [4*r8]
add r8, r8
test r9, r9
je .LBB0_14
neg r9
.LBB0_13: # =>This Inner Loop Header: Depth=1
vmovupd ymm0, ymmword ptr [rdx]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rax], ymm0
add rax, 32
add rdx, 32
inc r9
jne .LBB0_13
.LBB0_14:
lea rsi, [rsi + 8*r11 + 32]
lea rdi, [rdi + 8*r10]
add rcx, -4
sub rcx, r8
mov r10, rcx
; --- End of the unrolled loop ---
.LBB0_15:
test r10b, 1
je .LBB0_17
vmovupd xmm0, xmmword ptr [rsi]
vaddpd xmm0, xmm0, xmm0
vmovupd xmmword ptr [rdi], xmm0
.LBB0_17:
vzeroupper
ret
I tried playing with compiler options; the best I could get was with -Oz, but
the code is still really sub-optimal:
transform(double*, double const*, double const*, unsigned long):
# @transform(double*, double const*, double const*, unsigned long)
push 7
pop rax
sub rax, rcx
mov rdx, rcx
shr rdx, 3
xor r10d, r10d
test rax, rax
cmovle r10, rdx
mov r9, r10
shl r9, 4
lea r8, [8*r10]
shl r10, 7
add r10, rsi
mov rdx, rcx
mov rax, rdi
jmp .LBB0_1
.LBB0_8: # in Loop: Header=BB0_1 Depth=1
vmovupd ymm0, ymmword ptr [rsi]
vmovupd ymm1, ymmword ptr [rsi + 32]
vmovupd ymm2, ymmword ptr [rsi + 64]
vmovupd ymm3, ymmword ptr [rsi + 96]
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rax], ymm0
vaddpd ymm0, ymm1, ymm1
vmovupd ymmword ptr [rax + 32], ymm0
vaddpd ymm0, ymm2, ymm2
vmovupd ymmword ptr [rax + 64], ymm0
vaddpd ymm0, ymm3, ymm3
vmovupd ymmword ptr [rax + 96], ymm0
sub rsi, -128
sub rax, -128
.LBB0_1: # =>This Inner Loop Header: Depth=1
add rdx, -8
jns .LBB0_8
sub rcx, r8
lea r8, [rdi + 8*r9]
push 1
pop rax
sub rax, rcx
lea rdx, [rcx + rcx]
and rdx, -4
xor esi, esi
test rax, rax
cmovle rsi, rdx
mov rax, rcx
mov rdi, r10
mov rdx, r8
jmp .LBB0_3
.LBB0_4: # in Loop: Header=BB0_3 Depth=1
vmovupd ymm0, ymmword ptr [rdi]
add rdi, 32
vaddpd ymm0, ymm0, ymm0
vmovupd ymmword ptr [rdx], ymm0
add rdx, 32
.LBB0_3: # =>This Inner Loop Header: Depth=1
add rax, -2
jns .LBB0_4
test cl, 1
je .LBB0_7
vmovupd xmm0, xmmword ptr [r10 + 8*rsi]
vaddpd xmm0, xmm0, xmm0
vmovupd xmmword ptr [r8 + 8*rsi], xmm0
.LBB0_7:
vzeroupper
ret
The problem
-----------
It seems that clang doesn't want to just accept what was in the C++ source. In
this case a straightforward 1:1 translation would result in the best possible
code. For example here is the same function compiled by ICC:
transform(double*, double const*, double const*, unsigned long):
add rcx, -8 #11.11 FOLLOWS
C++ code!
js ..B1.5 # Prob 2% #11.22
..B1.3: # Preds ..B1.1 ..B1.3
vmovupd xmm0, XMMWORD PTR [rsi] #12.34
vmovupd xmm1, XMMWORD PTR [32+rsi] #13.34
vmovupd xmm2, XMMWORD PTR [64+rsi] #14.34
vmovupd xmm3, XMMWORD PTR [96+rsi] #15.34
vinsertf128 ymm6, ymm1, XMMWORD PTR [48+rsi], 1 #13.34
vinsertf128 ymm4, ymm0, XMMWORD PTR [16+rsi], 1 #12.34
vinsertf128 ymm8, ymm2, XMMWORD PTR [80+rsi], 1 #14.34
vinsertf128 ymm10, ymm3, XMMWORD PTR [112+rsi], 1 #15.34
add rsi, 128 #23.5
vaddpd ymm5, ymm4, ymm4 #17.32
vaddpd ymm7, ymm6, ymm6 #18.32
vaddpd ymm9, ymm8, ymm8 #19.32
vaddpd ymm11, ymm10, ymm10 #20.32
vmovupd XMMWORD PTR [rdi], xmm5 #17.22
vmovupd XMMWORD PTR [32+rdi], xmm7 #18.22
vmovupd XMMWORD PTR [64+rdi], xmm9 #19.22
vmovupd XMMWORD PTR [96+rdi], xmm11 #20.22
vextractf128 XMMWORD PTR [16+rdi], ymm5, 1 #17.22
vextractf128 XMMWORD PTR [48+rdi], ymm7, 1 #18.22
vextractf128 XMMWORD PTR [80+rdi], ymm9, 1 #19.22
vextractf128 XMMWORD PTR [112+rdi], ymm11, 1 #20.22
add rdi, 128 #22.5
add rcx, -8 #11.11
jns ..B1.3 # Prob 82% #11.22 FOLLOWS
C++ code!
..B1.5: # Preds ..B1.3 ..B1.1
add rcx, 6 #25.3
js ..B1.9 # Prob 2% #27.22 FOLLOWS
C++ code!
..B1.7: # Preds ..B1.5 ..B1.7
vmovupd xmm0, XMMWORD PTR [rsi] #28.34
vinsertf128 ymm1, ymm0, XMMWORD PTR [16+rsi], 1 #28.34
add rsi, 32 #32.5
vaddpd ymm2, ymm1, ymm1 #29.27
vmovupd XMMWORD PTR [rdi], xmm2 #29.22
vextractf128 XMMWORD PTR [16+rdi], ymm2, 1 #29.22
add rdi, 32 #31.5
add rcx, -2 #27.11 FOLLOWS
C++ code!
jns ..B1.7 # Prob 82% #27.22
..B1.9: # Preds ..B1.7 ..B1.5
test rcx, 1 #35.11
je ..B1.11 # Prob 60% #35.11
vmovupd xmm0, XMMWORD PTR [rsi] #36.31
vaddpd xmm1, xmm0, xmm0 #37.24
vmovupd XMMWORD PTR [rdi], xmm1 #37.19
..B1.11: # Preds ..B1.9 ..B1.10
vzeroupper #39.1
ret #39.1
This problem actually affects my own projects as the code surrounding loops can
consume more cycles than loops themselves if the number of iterations is
relatively small.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170303/b59c54d0/attachment-0001.html>
More information about the llvm-bugs
mailing list