[cfe-dev] missing optimization opportunity for const std::vector compared to std::array

Fri Sep 20 05:49:03 PDT 2013

On 20.09.2013, at 13:33, Dennis Luehring <dl.soluz at gmx.net> wrote:

> I've written this small test program to test the GCC 4.8.1 optimizer and found an optimization opportunity.
> 
> for details see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58483
> 
> Then I compared the GCC results to Clang 3.3 and found that the LLVM optimization "seems" to be far away from the GCC results in this case.

What you're seeing in the std::array case is likely an artifact of running the loop vectorizer too early in LLVM 3.3. This is fixed in trunk. The std::vector case is trickier; I think the main problem here is that the optimizer wasn't able to see that the loop inside std::accumulate has a constant trip count.

- Ben

> --- test.cpp ---
> #include <vector>
> #include <numeric>
> #include <array>
> 
> static int calc(const std::array<int,3> p_ints, const int& p_init)
> //static int calc(const std::vector<int> p_ints, const int& p_init)
> {
>  return std::accumulate(p_ints.begin(), p_ints.end(), p_init);
> }
> 
> int main()
> {
>  const int result = calc({10,20,30},100);
>  return result;
> }
> 
> gcc-optimizer result using std::array
> 
> main:
>    mov    eax, 160
>    ret
> 
> gcc-optimizer result using std::vector
> 
> main:
>    push    rbx
>    mov    edi, 12
>    call    operator new(unsigned long)
>    mov    rdx, QWORD PTR ._81[rip]
>    mov    rdi, rax
>    mov    QWORD PTR [rax], rdx
>    mov    eax, DWORD PTR ._81[rip+8]
>    mov    rsi, rdx
>    shr    rsi, 32
>    lea    ebx, [rsi+100+rdx]
>    add    ebx, eax
>    test    rdi, rdi
>    mov    DWORD PTR [rdi+8], eax
>    je    .L2
>    call    operator delete(void*)
> .L2:
>    mov    eax, ebx
>    pop    rbx
>    ret
> ._81:
>    .long    10
>    .long    20
>    .long    30
> 
> the clang 3.3 results for -O3 -march=native -std=c++11
> 
> using std::array
> 
> main:                                   # @main
>    movabsq    $85899345930, %rax      # imm = 0x140000000A
>    movq    %rax, -16(%rsp)
>    movl    $100, %esi
>    movl    $30, -8(%rsp)
>    xorl    %edx, %edx
>    leaq    -16(%rsp), %rcx
>    movb    $1, %al
>    testb    %al, %al
>    jne    .LBB0_1
>    movd    %esi, %xmm1
>    pxor    %xmm0, %xmm0
>    xorl    %eax, %eax
> .LBB0_3:                                # %vector.body.i.i
>    movdqu    (%rsp,%rax,4), %xmm2
>    paddd    %xmm2, %xmm0
>    movdqu    -16(%rsp,%rax,4), %xmm2
>    paddd    %xmm2, %xmm1
>    addq    $8, %rax
>    cmpq    %rax, %rdx
>    jne    .LBB0_3
>    jmp    .LBB0_4
> .LBB0_1:
>    pxor    %xmm0, %xmm0
>    movd    %esi, %xmm1
> .LBB0_4:                                # %middle.block.i.i
>    movl    $3, %esi
>    paddd    %xmm1, %xmm0
>    movdqa    %xmm0, %xmm1
>    movhlps    %xmm1, %xmm1            # xmm1 = xmm1[1,1]
>    paddd    %xmm0, %xmm1
>    phaddd    %xmm1, %xmm1
>    movd    %xmm1, %eax
>    cmpq    %rdx, %rsi
>    je    .LBB0_7
>    addq    $-12, %rcx
>    leaq    -16(%rsp), %rdx
> .LBB0_6:                                # %scalar.ph.i.i
>    addl    12(%rcx), %eax
>    addq    $4, %rcx
>    cmpq    %rcx, %rdx
>    jne    .LBB0_6
> .LBB0_7:                                # %_ZL4calcSt5arrayIiLm3EERKi.exit
>    ret
> 
> using std::vector
> 
> main:                                   # @main
>    pushq    %rbx
>    movl    $12, %edi
>    callq    operator new(unsigned long)
>    movabsq    $85899345930, %rcx      # imm = 0x140000000A
>    movq    %rcx, (%rax)
>    xorl    %ecx, %ecx
>    movl    $3, %edx
>    movl    $100, %esi
>    movl    $30, 8(%rax)
>    movb    $1, %bl
>    movd    %esi, %xmm1
>    pxor    %xmm0, %xmm0
>    testb    %bl, %bl
>    jne    .LBB0_3
>    xorl    %esi, %esi
> .LBB0_2:                                # %vector.body.i.i
>    movdqu    16(%rax,%rsi,4), %xmm2
>    paddd    %xmm2, %xmm0
>    movdqu    (%rax,%rsi,4), %xmm2
>    paddd    %xmm2, %xmm1
>    addq    $8, %rsi
>    cmpq    %rsi, %rcx
>    jne    .LBB0_2
> .LBB0_3:                                # %middle.block.i.i
>    paddd    %xmm1, %xmm0
>    movdqa    %xmm0, %xmm1
>    movhlps    %xmm1, %xmm1            # xmm1 = xmm1[1,1]
>    paddd    %xmm0, %xmm1
>    phaddd    %xmm1, %xmm1
>    movd    %xmm1, %ebx
>    cmpq    %rcx, %rdx
>    je    .LBB0_6
>    movq    %rax, %rcx
>    addq    $-12, %rcx
> .LBB0_5:                                # %scalar.ph.i.i
>    addl    12(%rcx), %ebx
>    addq    $4, %rcx
>    cmpq    %rcx, %rax
>    jne    .LBB0_5
> .LBB0_6:                                # %_ZL4calcSt6vectorIiSaIiEERKi.exit
>    testq    %rax, %rax
>    je    .LBB0_8
>    movq    %rax, %rdi
>    callq    operator delete(void*)
> .LBB0_8:                                # %_ZNSt6vectorIiSaIiEED1Ev.exit
>    movl    %ebx, %eax
>    popq    %rbx
>    ret
> 
> Is the LLVM optimizer not able to optimize this better, is this actually a better result, or am I doing something wrong here?
> 
> thx
> 
> 
> 
> 
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-dev