[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops
Hal Finkel
hfinkel at anl.gov
Tue Jul 22 21:00:44 PDT 2014
Hi Dennis,
Can you please file a bug for this at http://llvm.org/bugs/ -- we should not be vectorizing this loop of length 3.
-Hal
----- Original Message -----
> From: "Dennis Luehring" <dl.soluz at gmx.net>
> To: llvmdev at cs.uiuc.edu
> Sent: Tuesday, July 22, 2014 10:51:54 PM
> Subject: [LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops
>
> the clang 3.5 loop optimizer seems to jump in unintentionally for
> simple loops
>
> the very simple example
>
> ----
> const int SIZE = 3;
>
> int the_func(int* p_array)
> {
> int dummy = 0;
> #if defined(ITER)
> for(int* p = &p_array[0]; p < &p_array[SIZE]; ++p) dummy += *p;
> #else
> for(int i = 0; i < SIZE; ++i) dummy += p_array[i];
> #endif
> return dummy;
> }
>
> int main(int argc, char** argv)
> {
> int* array = new int[SIZE];
> for(int i = 0; i < SIZE; ++i){ array[i] = *argv[i]; }
> int dummy = the_func(array);
> delete[] array;
> return dummy;
> }
> ----
>
> compiled with gcc 4.9.1 and clang 3.5
>
> with clang 3.5 and ITER defined, the_func contains masses of code;
> the code in main is also sometimes different from (not just an inlined
> copy of) the_func
>
> clang -DITER -O2
> clang -DITER -O3
>
> gives:
>
> the_func:
> leaq 12(%rdi), %rcx
> leaq 4(%rdi), %rax
> cmpq %rax, %rcx
> cmovaq %rcx, %rax
> movq %rdi, %rsi
> notq %rsi
> addq %rax, %rsi
> shrq $2, %rsi
> incq %rsi
> xorl %edx, %edx
> movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
> andq %rsi, %rax
> pxor %xmm0, %xmm0
> je .LBB0_1
> # BB#2: # %vector.body.preheader
> leaq (%rdi,%rax,4), %r8
> addq $16, %rdi
> movq %rsi, %rdx
> andq $-8, %rdx
> pxor %xmm0, %xmm0
> pxor %xmm1, %xmm1
> .align 16, 0x90
> .LBB0_3: # %vector.body
> # =>This Inner Loop Header: Depth=1
> movdqa %xmm1, %xmm2
> movdqa %xmm0, %xmm3
> movdqu -16(%rdi), %xmm0
> movdqu (%rdi), %xmm1
> paddd %xmm3, %xmm0
> paddd %xmm2, %xmm1
> addq $32, %rdi
> addq $-8, %rdx
> jne .LBB0_3
> # BB#4:
> movq %r8, %rdi
> movq %rax, %rdx
> jmp .LBB0_5
> .LBB0_1:
> pxor %xmm1, %xmm1
> .LBB0_5: # %middle.block
> paddd %xmm1, %xmm0
> movdqa %xmm0, %xmm1
> movhlps %xmm1, %xmm1 # xmm1 = xmm1[1,1]
> paddd %xmm0, %xmm1
> pshufd $1, %xmm1, %xmm0 # xmm0 = xmm1[1,0,0,0]
> paddd %xmm1, %xmm0
> movd %xmm0, %eax
> cmpq %rdx, %rsi
> je .LBB0_7
> .align 16, 0x90
> .LBB0_6: # %scalar.ph
> # =>This Inner Loop Header: Depth=1
> addl (%rdi), %eax
> addq $4, %rdi
> cmpq %rcx, %rdi
> jb .LBB0_6
> .LBB0_7: # %._crit_edge
> retq
>
> isn't that a little bit too long?
>
> other better looking results:
>
> clang -O2
> clang -O3
> gcc -O3
> gcc -DITER -O3
>
> gives:
>
> the_func:
> movl 4(%rdi), %eax
> addl (%rdi), %eax
> addl 8(%rdi), %eax
> ret(q)
>
> looks good
>
> gcc -DITER -O2
>
> gives:
>
> the_func:
> leaq 12(%rdi), %rdx
> xorl %eax, %eax
> .L2:
> addl (%rdi), %eax
> addq $4, %rdi
> cmpq %rdx, %rdi
> jne .L2
> rep ret
>
> looks good
>
>
> gcc 4.9.1 seems to be more "stable" in how it optimizes the_func
> and main
>
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>
--
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory
More information about the llvm-dev
mailing list