[LLVMdev] Suboptimal code due to excessive spilling

Thu Apr 5 09:14:56 PDT 2012

It would be unless one actually wanted the unroll to happen.  Thank
you for the suggestion though.  I have filed a bug
(http://llvm.org/bugs/show_bug.cgi?id=12392) so hopefully one of the
developers can find the bug or suggest a workaround that avoids the
issue.

Brent

On Thu, Apr 5, 2012 at 7:36 PM, Patrik Hägglund H
<patrik.h.hagglund at ericsson.com> wrote:
> I don't know much about this, but maybe -mllvm -unroll-count=1 can be used as a workaround?
>
> /Patrik Hägglund
>
> -----Original Message-----
> From: llvmdev-bounces at cs.uiuc.edu [mailto:llvmdev-bounces at cs.uiuc.edu] On Behalf Of Brent Walker
> Sent: den 28 mars 2012 03:18
> To: llvmdev
> Subject: [LLVMdev] Suboptimal code due to excessive spilling
>
> Hi,
>
> I have run into the following strange behavior and wanted to ask for
> some advice.  For the C program below, function sum() gets inlined in
> foo() but the code generated looks very suboptimal (the code is an
> extract from a larger program).
>
> Below I show the 32-bit x86 assembly as produced by the demo page on
> the llvm home page ("Output A").  As you can see from the assembly,
> after sum() is inlined and the loop unrolled, the generated code
> loads all values of array v (aka &x[i]) into registers before adding
> any numbers up -- in the process it runs out of registers and starts
> spilling (in essence copying the doubles from one area of memory to
> another).  After that, it proceeds to add the numbers up.
>
> But why not add the numbers into 1 register directly?  Clearly this is
> what the C code is doing -- nothing could have been more explicit.
> The really strange thing is that if the assignment to p[i] is removed
> (line marked with "xxx..."), then the code produced is optimal and
> exactly what one expects.  I show this result in "Output B" where you
> get a beautiful sequence of addsd into register xmm2.
>
> It's all very strange and it points to some questionable decision
> making on the part of llvm.  I tried different versions of the sum()
> function (eliminating the loop for example) but it does not help.
> Another observation is that the loop variable i (in foo) must be
> involved: if one does *p = 5 (instead of p[i] = 5), the problem also
> goes away.
>
> I would appreciate some advice on how to get around this problem.
>
> Thank you for any help,
> Brent
>
>
> double sum( double* v, int v_siz )
> {
>    double sum = 0.0;
>    int i = 0;
>
>    for (; i != v_siz; ++i)
>        sum += v[i];
>
>    return sum;
> }
>
> double foo(double *x, int *p, int k)
> {
>    double s = 0.0;
>    for (int i = 0; i != k;++i)
>    {
>       s += sum(&x[i], 18);
>       p[i] = 5;   // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
>    }
>    return s;
> }
>
> ====== Output A ======
> ======================
> foo:                                    # @foo
> .Ltmp12:
>        .cfi_startproc
> # BB#0:
>        pushl   %ebx
> .Ltmp13:
>        .cfi_def_cfa_offset 8
>        pushl   %edi
> .Ltmp14:
>        .cfi_def_cfa_offset 12
>        pushl   %esi
> .Ltmp15:
>        .cfi_def_cfa_offset 16
>        subl    $88, %esp
> .Ltmp16:
>        .cfi_def_cfa_offset 104
> .Ltmp17:
>        .cfi_offset %esi, -16
> .Ltmp18:
>        .cfi_offset %edi, -12
> .Ltmp19:
>        .cfi_offset %ebx, -8
>        pxor    %xmm0, %xmm0
>        movl    112(%esp), %eax
>        testl   %eax, %eax
>        je      .LBB1_3
> # BB#1:
>        xorl    %ebx, %ebx
>        movl    108(%esp), %ecx
>        movl    104(%esp), %edx
>        xorl    %esi, %esi
>        .align  16, 0x90
> .LBB1_2:                                # %.lr.ph.i
>                                        # =>This Inner Loop Header: Depth=1
>        movsd   (%edx,%ebx,8), %xmm2
>        addsd   .LCPI1_0, %xmm2
>        movsd   16(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, (%esp)           # 8-byte Spill
>        movl    %ebx, %edi
>        addl    $1, %edi
>        addsd   (%edx,%edi,8), %xmm2
>        movsd   136(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 72(%esp)         # 8-byte Spill
>        movsd   128(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 64(%esp)         # 8-byte Spill
>        movsd   120(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 56(%esp)         # 8-byte Spill
>        movsd   112(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 48(%esp)         # 8-byte Spill
>        movsd   104(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 40(%esp)         # 8-byte Spill
>        movsd   96(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 32(%esp)         # 8-byte Spill
>        movsd   88(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 24(%esp)         # 8-byte Spill
>        movsd   80(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 16(%esp)         # 8-byte Spill
>        movsd   72(%edx,%ebx,8), %xmm1
>        movsd   %xmm1, 8(%esp)          # 8-byte Spill
>        movsd   64(%edx,%ebx,8), %xmm7
>        movsd   56(%edx,%ebx,8), %xmm1
>        movsd   48(%edx,%ebx,8), %xmm3
>        movsd   40(%edx,%ebx,8), %xmm4
>        movsd   32(%edx,%ebx,8), %xmm5
>        movsd   24(%edx,%ebx,8), %xmm6
>        movl    $5, (%ecx,%ebx,4)
>        addsd   (%esp), %xmm2           # 8-byte Folded Reload
>        addsd   %xmm6, %xmm2
>        addsd   %xmm5, %xmm2
>        addsd   %xmm4, %xmm2
>        addsd   %xmm3, %xmm2
>        addsd   %xmm1, %xmm2
>        addsd   %xmm7, %xmm2
>        addsd   8(%esp), %xmm2          # 8-byte Folded Reload
>        addsd   16(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   24(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   32(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   40(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   48(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   56(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   64(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   72(%esp), %xmm2         # 8-byte Folded Reload
>        addsd   %xmm2, %xmm0
>        adcl    $0, %esi
>        cmpl    %eax, %edi
>        movl    %edi, %ebx
>        jne     .LBB1_2
> .LBB1_3:                                # %._crit_edge
>        movsd   %xmm0, 80(%esp)
>        fldl    80(%esp)
>        addl    $88, %esp
>        popl    %esi
>        popl    %edi
>        popl    %ebx
>        ret
> .Ltmp20:
>        .size   foo, .Ltmp20-foo
> .Ltmp21:
>        .cfi_endproc
> .Leh_func_end1:
>
>
> ====== Output B ======
> ======================
>
> foo:                                    # @foo
> .Ltmp11:
>        .cfi_startproc
> # BB#0:
>        pushl   %edi
> .Ltmp12:
>        .cfi_def_cfa_offset 8
>        pushl   %esi
> .Ltmp13:
>        .cfi_def_cfa_offset 12
>        subl    $12, %esp
> .Ltmp14:
>        .cfi_def_cfa_offset 24
> .Ltmp15:
>        .cfi_offset %esi, -12
> .Ltmp16:
>        .cfi_offset %edi, -8
>        pxor    %xmm0, %xmm0
>        movl    32(%esp), %eax
>        testl   %eax, %eax
>        je      .LBB1_3
> # BB#1:
>        xorl    %esi, %esi
>        movl    24(%esp), %ecx
>        pxor    %xmm1, %xmm1
>        xorl    %edx, %edx
>        .align  16, 0x90
> .LBB1_2:                                # %.lr.ph.i
>                                        # =>This Inner Loop Header: Depth=1
>        movsd   (%ecx,%esi,8), %xmm2
>        addsd   %xmm1, %xmm2
>        movl    %esi, %edi
>        addl    $1, %edi
>        addsd   (%ecx,%edi,8), %xmm2
>        addsd   16(%ecx,%esi,8), %xmm2
>        addsd   24(%ecx,%esi,8), %xmm2
>        addsd   32(%ecx,%esi,8), %xmm2
>        addsd   40(%ecx,%esi,8), %xmm2
>        addsd   48(%ecx,%esi,8), %xmm2
>        addsd   56(%ecx,%esi,8), %xmm2
>        addsd   64(%ecx,%esi,8), %xmm2
>        addsd   72(%ecx,%esi,8), %xmm2
>        addsd   80(%ecx,%esi,8), %xmm2
>        addsd   88(%ecx,%esi,8), %xmm2
>        addsd   96(%ecx,%esi,8), %xmm2
>        addsd   104(%ecx,%esi,8), %xmm2
>        addsd   112(%ecx,%esi,8), %xmm2
>        addsd   120(%ecx,%esi,8), %xmm2
>        addsd   128(%ecx,%esi,8), %xmm2
>        addsd   136(%ecx,%esi,8), %xmm2
>        addsd   %xmm2, %xmm0
>        adcl    $0, %edx
>        cmpl    %eax, %edi
>        movl    %edi, %esi
>        jne     .LBB1_2
> .LBB1_3:                                # %._crit_edge
>        movsd   %xmm0, (%esp)
>        fldl    (%esp)
>        addl    $12, %esp
>        popl    %esi
>        popl    %edi
>        ret
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev