[llvm-bugs] [Bug 44460] New: Missed optimization : Loop unrolling causes inefficient use of `adc` as compared to loop rolling
via llvm-bugs
llvm-bugs at lists.llvm.org
Sun Jan 5 00:01:49 PST 2020
https://bugs.llvm.org/show_bug.cgi?id=44460
Bug ID: 44460
Summary: Missed optimization : Loop unrolling causes
inefficient use of `adc` as compared to loop rolling
Product: new-bugs
Version: 9.0
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: new bugs
Assignee: unassignedbugs at nondot.org
Reporter: madhur4127 at gmail.com
CC: htmldeveloper at gmail.com, llvm-bugs at lists.llvm.org
Consider emulating a 192-bit integer using a 128-bit integer and a 64-bit
integer. In the code sample below, this emulated integer is used to compute the
dot product of two uint64_t vectors of length N.
// function to compute dot product of two vectors
using u128 = unsigned __int128;
const int N = 2048;
uint64_t a[N], b[N];
u128 sum = 0;
uint64_t overflow = 0;
for(int i=0;i<N;++i){
u128 prod = (u128) a[i] * (u128) b[i];
sum += prod;
// gcc branches, clang just uses: adc overflow, 0
overflow += sum<prod;
}
To detect overflow of the 128-bit sum and propagate the resulting carry into
`overflow`, `adc` can be used. This idiom compiles well when the loop is kept
rolled (i.e. not unrolled).
clang++ -O3 -Wall -Wextra -march=broadwell -fno-unroll-loops
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov rax, qword ptr [rsi + 8*rcx]
mul qword ptr [rdi + 8*rcx]
add r10, rax
adc r9, rdx
adc r11, 0 # This is efficient form
inc rcx
cmp rcx, 2048
jne .LBB0_1
mov qword ptr [r8], r11
mov rax, r10
mov rdx, r9
ret
------
But when the loop is unrolled, this efficient asm degrades to `mov; setb; movzx;
add;` instead of just `adc reg, 0`.
clang++ -O3 -Wall -Wextra -march=broadwell # fno-unroll-loops is absent
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov rax, qword ptr [rsi + 8*rbx]
mov r10, qword ptr [rsi + 8*rbx + 8]
mul qword ptr [rdi + 8*rbx]
mov r11, rdx
mov r14, rax
add r14, r9
adc r11, rcx
setb bpl
mov rax, r10
mul qword ptr [rdi + 8*rbx + 8]
mov rcx, rax
mov r9, rdx
movzx ebp, bpl
add rcx, r14
adc r9, r11
adc rbp, r15
mov rax, qword ptr [rsi + 8*rbx + 16]
mul qword ptr [rdi + 8*rbx + 16]
mov r10, rdx
mov r11, rax
add r11, rcx
adc r10, r9
setb cl
mov rax, qword ptr [rsi + 8*rbx + 24]
mul qword ptr [rdi + 8*rbx + 24]
movzx r15d, cl
mov r9, rax
add r9, r11
mov rcx, rdx
adc rcx, r10
adc r15, rbp
add rbx, 4
cmp rbx, 2048
jne .LBB0_1
For complete source code, here is the godbolt link:
https://godbolt.org/z/tT7Z2H
The source of this discussion is the following Stack Overflow Q&A:
https://stackoverflow.com/q/59575408/8199790
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20200105/993437c6/attachment.html>
More information about the llvm-bugs
mailing list