[llvm-bugs] [Bug 33757] New: GCC produces much better code for simple double loops than Clang

Wed Jul 12 02:37:44 PDT 2017

https://bugs.llvm.org/show_bug.cgi?id=33757

            Bug ID: 33757
           Summary: GCC produces much better code for simple double loops
                    than Clang
           Product: libraries
           Version: trunk
          Hardware: All
                OS: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Loop Optimizer
          Assignee: unassignedbugs at nondot.org
          Reporter: gonzalobg88 at gmail.com
                CC: llvm-bugs at lists.llvm.org

See the code and assembly comparison at the link below; the code and the assembly are also reproduced after it:

https://gcc.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(fontScale:1.1,j:1,source:'%23include+%3Cstdint.h%3E%0A%0Aint64_t+c_iter(int64_t+high)%0A%7B+++++++%0A++int64_t+total+%3D+0%3B%0A++int64_t+i%3B%0A++for+(i+%3D+1%3B+i+%3C%3D+high%3B+i%2B%2B)+%7B%0A++++while+(i+%25+2+!!%3D+0)+%7B%0A++++++i%2B%2B%3B%0A++++%7D%0A++++total+%2B%3D+i+*+2%3B%0A++%7D%0A++return+total%3B%0A%7D%0A%0Aint64_t+c_loop(int64_t+high)+%7B%0A++int64_t+total+%3D+0%3B%0A++int64_t+i%3B%0A++for+(i+%3D+1%3B+i+%3C%3D+high%3B+%2B%2Bi)+%7B%0A++++if+(i+%25+2+%3D%3D+0)+%7B%0A++++++total+%2B%3D+i+*+2%3B%0A++++%7D%0A++%7D%0A++return+total%3B%0A%7D'),l:'5',n:'0',o:'C%2B%2B+source+%231',t:'0')),k:33.03381678958197,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:clang400,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-O2+-fno-unroll-loops',source:1),l:'5',n:'0',o:'x86-64+clang+4.0.0+(Editor+%231,+Compiler+%231)',t:'0')),k:33.632849877084716,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:g71,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-O2',source:1),l:'5',n:'0',o:'x86-64+gcc+7.1+(Editor+%231,+Compiler+%232)',t:'0')),k:33.33333333333333,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4

Code:

#include <stdint.h>

// Nested-loop variant of the test case.
// Each outer iteration (entered while i <= high) advances i to the next even
// value with the inner while loop and then adds i * 2 to total.
// Note that the inner loop does not re-check the bound, so an odd `high`
// still contributes 2 * (high + 1).
// Returns 0 when high < 1 because the outer loop is never entered.
int64_t c_iter(int64_t high)
{       
  int64_t total = 0;
  int64_t i;
  for (i = 1; i <= high; i++) {
    // Skip odd values; i is even when this loop exits.
    while (i % 2 != 0) {
      i++;
    }
    total += i * 2;
  }
  return total;
}

// Single-loop variant of the test case.
// Visits every i in [1, high] and adds i * 2 to total only when i is even.
// Returns 0 when high < 1 because the loop is never entered.
int64_t c_loop(int64_t high) {
  int64_t total = 0;
  int64_t i;
  for (i = 1; i <= high; ++i) {
    // Odd values of i contribute nothing.
    if (i % 2 == 0) {
      total += i * 2;
    }
  }
  return total;
}

GCC assembly: -O2

# GCC listing for c_iter: rdi = high on entry, rax = returned total.
# The nested source loops become one counted loop that adds 4, 8, 12, ...
c_iter(long):
        test    rdi, rdi
        jle     .L4                     # high <= 0: return 0
        sub     rdi, 1
        mov     edx, 4                  # rdx = first addend (2 * 2)
        xor     eax, eax                # total = 0
        shr     rdi                     # rdi = (high - 1) / 2
        lea     rcx, [8+rdi*4]          # rcx = value of rdx at which the loop stops
.L3:
        add     rax, rdx                # total += current addend
        add     rdx, 4                  # the next even i contributes 4 more
        cmp     rcx, rdx
        jne     .L3
        rep ret
.L4:
        xor     eax, eax                # empty range: total = 0
        ret
# GCC listing for c_loop: rdi = high on entry, rax = returned total.
# One iteration per i; the even test is done with a conditional move
# rather than a branch.
c_loop(long):
        test    rdi, rdi
        jle     .L12                    # high <= 0: return 0
        add     rdi, 1                  # rdi = high + 1 (exit value for i)
        mov     edx, 1                  # rdx = i = 1
        xor     eax, eax                # total = 0
        jmp     .L9                     # i = 1 is odd, so enter at the increment
.L11:
        lea     rcx, [rax+rdx*2]        # rcx = total + i * 2
        test    dl, 1
        cmove   rax, rcx                # keep the new sum only when i is even
.L9:
        add     rdx, 1                  # ++i
        cmp     rdx, rdi
        jne     .L11                    # continue until i == high + 1
        rep ret
.L12:
        xor     eax, eax                # empty range: total = 0
        ret

Clang assembly: -O2 -funroll-loops

# Clang listing for c_iter: rdi = high on entry, rax = returned total.
# Both source loops are kept: i advances one step at a time and total is
# raised by 2 per step so that it tracks the 2 * i contribution.
c_iter(long):                             # @c_iter(long)
        xor     eax, eax                # total = 0
        test    rdi, rdi
        jle     .LBB0_5                 # high <= 0: return 0
        mov     ecx, 1                  # rcx = i = 1
.LBB0_2:                                # =>This Loop Header: Depth=1
        lea     rax, [rax + 2*rcx - 2]  # total += 2 * i - 2; the inner loop adds the rest
.LBB0_3:                                #   Parent Loop BB0_2 Depth=1
        mov     edx, ecx                # edx = i before the increment
        inc     rcx                     # i++
        add     rax, 2
        test    dl, 1
        jne     .LBB0_3                 # repeat while the pre-increment i was odd
        lea     rdx, [rcx - 1]          # rdx = the even i that ended the inner loop
        cmp     rdx, rdi
        jl      .LBB0_2                 # continue the outer loop while that i < high
.LBB0_5:
        ret

# Clang listing for c_loop: rdi = high on entry, rax = returned total.
# The loop is unrolled by four: .LBB1_5 handles the first high % 4
# iterations one at a time and .LBB1_9 handles the rest four at a time.
c_loop(long):                             # @c_loop(long)
        test    rdi, rdi
        jle     .LBB1_1                 # high <= 0: return 0
        lea     r8, [rdi - 1]           # r8 = high - 1
        mov     r9, rdi
        and     r9, 3                   # r9 = high % 4 = single-step iteration count
        je      .LBB1_3                 # no leftover iterations: skip the single-step loop
        xor     edx, edx                # rdx = i = 0 (incremented at the loop top)
        mov     ecx, 2                  # rcx = 2 * i for the upcoming i
        xor     eax, eax                # total = 0
.LBB1_5:                                # =>This Inner Loop Header: Depth=1
        inc     rdx                     # ++i
        test    dl, 1
        mov     esi, 0                  # addend = 0 (mov leaves the flags from test intact)
        cmove   rsi, rcx                # i even: addend = 2 * i
        add     rax, rsi                # total += addend
        add     rcx, 2
        cmp     r9, rdx
        jne     .LBB1_5                 # until high % 4 iterations are done
        inc     rdx                     # rdx = next i to process
        cmp     r8, 3
        jae     .LBB1_8                 # high - 1 >= 3: run the unrolled loop
        jmp     .LBB1_10
.LBB1_1:
        xor     eax, eax                # empty range: total = 0
        ret
.LBB1_3:
        xor     eax, eax                # total = 0
        mov     edx, 1                  # rdx = i = 1
        cmp     r8, 3
        jb      .LBB1_10                # high - 1 < 3: nothing left to do
.LBB1_8:
        neg     rdi                     # rdi = -high, used by the exit test below
        lea     rcx, [rdx + rdx + 6]    # rcx = 2 * (i + 3)
        lea     rdx, [rdx + 3]          # rdx = i + 3 (last i of the current group of four)
        xor     r8d, r8d                # r8 = 0, the addend selected for odd i
.LBB1_9:                                # =>This Inner Loop Header: Depth=1
        lea     r9d, [rdx - 3]          # first i of the group
        lea     rsi, [rcx - 6]          # its 2 * i
        test    r9b, 1
        cmovne  rsi, r8                 # odd: contribute 0
        add     rsi, rax
        lea     r9d, [rdx - 2]          # second i of the group
        lea     rax, [rcx - 4]
        test    r9b, 1
        cmovne  rax, r8                 # odd: contribute 0
        add     rax, rsi
        lea     r9d, [rdx - 1]          # third i of the group
        lea     rsi, [rcx - 2]
        test    r9b, 1
        cmovne  rsi, r8                 # odd: contribute 0
        add     rsi, rax
        test    dl, 1                   # fourth i of the group
        mov     eax, 0
        cmove   rax, rcx                # even: contribute 2 * i
        add     rax, rsi                # rax = updated total
        lea     rsi, [rdi + rdx + 4]    # rsi = (last i of this group) - high + 4
        add     rdx, 4
        add     rcx, 8
        cmp     rsi, 4
        jne     .LBB1_9                 # stop once the last i of the group equals high
.LBB1_10:
        ret

Clang assembly: -O2 -fno-unroll-loops

# Clang listing for c_iter: rdi = high on entry, rax = returned total.
# Both source loops are kept: i advances one step at a time and total is
# raised by 2 per step so that it tracks the 2 * i contribution.
c_iter(long):                             # @c_iter(long)
        xor     eax, eax                # total = 0
        test    rdi, rdi
        jle     .LBB0_5                 # high <= 0: return 0
        mov     ecx, 1                  # rcx = i = 1
.LBB0_2:                                # =>This Loop Header: Depth=1
        lea     rax, [rax + 2*rcx - 2]  # total += 2 * i - 2; the inner loop adds the rest
.LBB0_3:                                #   Parent Loop BB0_2 Depth=1
        mov     edx, ecx                # edx = i before the increment
        inc     rcx                     # i++
        add     rax, 2
        test    dl, 1
        jne     .LBB0_3                 # repeat while the pre-increment i was odd
        lea     rdx, [rcx - 1]          # rdx = the even i that ended the inner loop
        cmp     rdx, rdi
        jl      .LBB0_2                 # continue the outer loop while that i < high
.LBB0_5:
        ret

# Clang listing for c_loop: rdi = high on entry, rax = returned total.
# One iteration per i; a conditional move selects either 2 * i or 0 as the
# addend, and that addend is added on every iteration.
c_loop(long):                             # @c_loop(long)
        test    rdi, rdi
        jle     .LBB1_1                 # high <= 0: return 0
        xor     ecx, ecx                # rcx = i = 0 (incremented at the loop top)
        mov     edx, 2                  # rdx = 2 * i for the upcoming i
        xor     eax, eax                # total = 0
.LBB1_3:                                # =>This Inner Loop Header: Depth=1
        inc     rcx                     # ++i
        test    cl, 1
        mov     esi, 0                  # addend = 0 (mov leaves the flags from test intact)
        cmove   rsi, rdx                # i even: addend = 2 * i
        add     rax, rsi                # total += addend
        add     rdx, 2
        cmp     rdi, rcx
        jne     .LBB1_3                 # until i == high
        ret
.LBB1_1:
        xor     eax, eax                # empty range: total = 0
        ret

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20170712/5b2f7652/attachment.html>