[llvm-bugs] [Bug 44460] New: Missed optimization : Loop unrolling causes inefficient use of `adc` as compared to loop rolling

Sun Jan 5 00:01:49 PST 2020

https://bugs.llvm.org/show_bug.cgi?id=44460

            Bug ID: 44460
           Summary: Missed optimization : Loop unrolling causes
                    inefficient use of `adc` as compared to loop rolling
           Product: new-bugs
           Version: 9.0
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: new bugs
          Assignee: unassignedbugs at nondot.org
          Reporter: madhur4127 at gmail.com
                CC: htmldeveloper at gmail.com, llvm-bugs at lists.llvm.org

Consider emulating a 192-bit integer using a 128-bit integer and a 64-bit
integer. In the code sample below, this emulated integer is used to accumulate
the dot product of two uint64_t vectors of length N.

    // function to compute dot product of two vectors
    using u128 = unsigned __int128;
    const int N = 2048;
    uint64_t a[N], b[N];
    u128 sum = 0;
    uint64_t overflow = 0;
    for(int i=0;i<N;++i){
        u128 prod = (u128) a[i] * (u128) b[i];
        sum += prod;
        // gcc branches, clang just uses: adc overflow, 0
        overflow += sum<prod;
    }

To detect overflow of the 128-bit sum and propagate the resulting carry into
`overflow`, a single `adc` can be used. Clang emits this efficient form when
the loop is kept rolled (i.e. compiled with -fno-unroll-loops).

clang++ -O3 -Wall -Wextra -march=broadwell -fno-unroll-loops

.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        mov     rax, qword ptr [rsi + 8*rcx]
        mul     qword ptr [rdi + 8*rcx]
        add     r10, rax
        adc     r9, rdx

        adc     r11, 0                  # This is efficient form

        inc     rcx
        cmp     rcx, 2048
        jne     .LBB0_1
        mov     qword ptr [r8], r11
        mov     rax, r10
        mov     rdx, r9
        ret
------

But when the loop is unrolled, this efficient assembly degrades to
`mov; setb; movzx; add;` instead of just `adc reg, 0`.

clang++ -O3 -Wall -Wextra -march=broadwell  # -fno-unroll-loops is absent

.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        mov     rax, qword ptr [rsi + 8*rbx]
        mov     r10, qword ptr [rsi + 8*rbx + 8]
        mul     qword ptr [rdi + 8*rbx]
        mov     r11, rdx
        mov     r14, rax
        add     r14, r9
        adc     r11, rcx
        setb    bpl
        mov     rax, r10
        mul     qword ptr [rdi + 8*rbx + 8]
        mov     rcx, rax
        mov     r9, rdx
        movzx   ebp, bpl
        add     rcx, r14
        adc     r9, r11
        adc     rbp, r15
        mov     rax, qword ptr [rsi + 8*rbx + 16]
        mul     qword ptr [rdi + 8*rbx + 16]
        mov     r10, rdx
        mov     r11, rax
        add     r11, rcx
        adc     r10, r9
        setb    cl
        mov     rax, qword ptr [rsi + 8*rbx + 24]
        mul     qword ptr [rdi + 8*rbx + 24]
        movzx   r15d, cl
        mov     r9, rax
        add     r9, r11
        mov     rcx, rdx
        adc     rcx, r10
        adc     r15, rbp
        add     rbx, 4
        cmp     rbx, 2048
        jne     .LBB0_1

For complete source code, here is the godbolt link:
https://godbolt.org/z/tT7Z2H

This discussion originated from the following Stack Overflow Q&A:
https://stackoverflow.com/q/59575408/8199790

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20200105/993437c6/attachment.html>