[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

Dennis Luehring dl.soluz at gmx.net
Tue Jul 22 20:51:54 PDT 2014


The clang 3.5 loop optimizer seems to kick in unintentionally for simple loops.

Here is a very simple example:

----
// Element count shared by the_func (loop bound) and main (allocation and fill).
const int SIZE = 3;

// Returns the sum of the SIZE ints starting at p_array.
// The loop is written in one of two forms, selected at compile time:
//   - ITER defined:     a pointer walks from &p_array[0] up to &p_array[SIZE]
//   - ITER not defined: an index i runs over [0, SIZE)
int the_func(int* p_array)
{
    int dummy = 0;  // running sum
#if defined(ITER)
    // pointer-iteration form
    for(int* p = &p_array[0]; p < &p_array[SIZE]; ++p) dummy += *p;
#else
    // index form
    for(int i = 0; i < SIZE; ++i) dummy += p_array[i];
#endif
    return dummy;
}

// Driver for the_func: allocates SIZE ints, sets element i to the first
// character of argv[i], sums them with the_func and returns that sum.
// NOTE(review): argv[i] is dereferenced for every i < SIZE without looking at
// argc, so this presumably expects at least SIZE - 1 command-line arguments.
int main(int argc, char** argv)
{
    int* array = new int[SIZE];
    // presumably filled from argv so the values are not compile-time constants
    for(int i = 0; i < SIZE; ++i){ array[i] = *argv[i]; }
    int dummy = the_func(array);
    delete[] array;
    return dummy;
}
----

compiled with gcc 4.9.1 and clang 3.5

With clang 3.5 and ITER defined, the_func contains a large amount of code.
The code generated for main is also sometimes different from the_func (it is not just an inlined copy).

clang -DITER -O2
clang -DITER -O3

gives:

# the_func as generated for the pointer-iteration loop (x86-64, AT&T syntax).
# In:  %rdi = p_array        Out: %eax = sum
# Structure: compute the iteration count, run a vector loop that consumes
# 8 ints per pass, reduce the vector accumulators to one value, then finish
# any leftover elements in a scalar loop.
the_func:
      # rcx = p_array + 12 bytes = &p_array[SIZE], the loop end pointer
      leaq    12(%rdi), %rcx
      # rax = unsigned max(p_array + 4, end)
      leaq    4(%rdi), %rax
      cmpq    %rax, %rcx
      cmovaq    %rcx, %rax
      # rsi = ((rax - p_array - 1) >> 2) + 1 = number of loop iterations
      movq    %rdi, %rsi
      notq    %rsi
      addq    %rax, %rsi
      shrq    $2, %rsi
      incq    %rsi
      # rdx = 0: no elements handled by the vector loop yet
      xorl    %edx, %edx
      # rax = iteration count rounded down to a multiple of 8
      movabsq    $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
      andq    %rsi, %rax
      pxor    %xmm0, %xmm0
      # flags are still those of the andq: no full group of 8 -> skip vector loop
      je    .LBB0_1
# BB#2:                                 # %vector.body.preheader
      # r8  = pointer just past the part the vector loop will cover
      # rdi = biased by 16 so the body can load -16(%rdi) and (%rdi)
      # rdx = remaining element count for the vector loop (multiple of 8)
      # xmm0/xmm1 = two zeroed 4-lane accumulators
      leaq    (%rdi,%rax,4), %r8
      addq    $16, %rdi
      movq    %rsi, %rdx
      andq    $-8, %rdx
      pxor    %xmm0, %xmm0
      pxor    %xmm1, %xmm1
      .align    16, 0x90
.LBB0_3:                                # %vector.body
                                          # =>This Inner Loop Header: Depth=1
      # each pass loads 8 consecutive ints (two unaligned 16-byte loads) and
      # adds them lane-wise into the two accumulators
      movdqa    %xmm1, %xmm2
      movdqa    %xmm0, %xmm3
      movdqu    -16(%rdi), %xmm0
      movdqu    (%rdi), %xmm1
      paddd    %xmm3, %xmm0
      paddd    %xmm2, %xmm1
      addq    $32, %rdi
      addq    $-8, %rdx
      jne    .LBB0_3
# BB#4:
      # rdi = first element not yet summed, rdx = elements already summed
      movq    %r8, %rdi
      movq    %rax, %rdx
      jmp    .LBB0_5
.LBB0_1:
      # vector loop skipped: make the second accumulator zero as well
      pxor    %xmm1, %xmm1
.LBB0_5:                                # %middle.block
      # combine both accumulators, then fold the four lanes into eax
      paddd    %xmm1, %xmm0
      movdqa    %xmm0, %xmm1
      movhlps    %xmm1, %xmm1            # xmm1 = xmm1[1,1]
      paddd    %xmm0, %xmm1
      pshufd    $1, %xmm1, %xmm0        # xmm0 = xmm1[1,0,0,0]
      paddd    %xmm1, %xmm0
      movd    %xmm0, %eax
      # every iteration already covered by the vector loop -> return
      cmpq    %rdx, %rsi
      je    .LBB0_7
      .align    16, 0x90
.LBB0_6:                                # %scalar.ph
                                          # =>This Inner Loop Header: Depth=1
      # scalar remainder: add one int per pass until rdi reaches end (rcx)
      addl    (%rdi), %eax
      addq    $4, %rdi
      cmpq    %rcx, %rdi
      jb    .LBB0_6
.LBB0_7:                                # %._crit_edge
      retq

isn't that a little bit too long?

Other, better-looking results:

clang -O2
clang -O3
gcc -O3
gcc -DITER -O3

gives:

# the_func with the loop fully unrolled: three straight-line loads/adds.
# In:  %rdi = p_array        Out: %eax = p_array[1] + p_array[0] + p_array[2]
the_func:
      movl    4(%rdi), %eax
      addl    (%rdi), %eax
      addl    8(%rdi), %eax
      ret(q)

looks good

gcc -DITER -O2

gives:

# the_func as a plain scalar pointer loop (x86-64, AT&T syntax).
# In:  %rdi = p_array        Out: %eax = sum
the_func:
      # rdx = p_array + 12 bytes = end pointer; eax = 0 is the running sum
      leaq    12(%rdi), %rdx
      xorl    %eax, %eax
.L2:
      # add the int at rdi, advance by 4 bytes, repeat until rdi == end
      addl    (%rdi), %eax
      addq    $4, %rdi
      cmpq    %rdx, %rdi
      jne    .L2
      rep ret

looks good


gcc 4.9.1 seems to be more "stable" in how it optimizes the_func and main.




More information about the llvm-dev mailing list