[LLVMdev] Suboptimal code due to excessive spilling
Brent Walker
brenthwalker at gmail.com
Thu Apr 5 09:14:56 PDT 2012
It would be unless one actually wanted the unroll to happen. Thank
you for the suggestion though. I have filed a bug
(http://llvm.org/bugs/show_bug.cgi?id=12392) so hopefully one of the
developers can find the bug or suggest a work around that avoids the
issue.
Brent
On Thu, Apr 5, 2012 at 7:36 PM, Patrik Hägglund H
<patrik.h.hagglund at ericsson.com> wrote:
> I don't know much about this, but maybe -mllvm -unroll-count=1 can be used as a workaround?
>
> /Patrik Hägglund
>
> -----Original Message-----
> From: llvmdev-bounces at cs.uiuc.edu [mailto:llvmdev-bounces at cs.uiuc.edu] On Behalf Of Brent Walker
> Sent: den 28 mars 2012 03:18
> To: llvmdev
> Subject: [LLVMdev] Suboptimal code due to excessive spilling
>
> Hi,
>
> I have run into the following strange behavior and wanted to ask for
> some advice. For the C program below, function sum() gets inlined in
> foo() but the code generated looks very suboptimal (the code is an
> extract from a larger program).
>
> Below I show the 32-bit x86 assembly as produced by the demo page on
> the llvm home page ("Output A"). As you can see from the assembly,
> after sum() is inlined and the loop unrolled, the generated code
> loads all values of array v (aka &x[i]) into registers before adding
> any numbers up -- in the process it runs out of registers and starts
> spilling (in essense copying the doubles from one area of memory to
> another). After that, it proceeds to add the numbers up.
>
> But why not add the numbers into 1 register directly? Clearly this is
> what the C code is doing -- nothing could have been more explicit.
> The really strange thing, is that in the assingment to p[i] is removed
> (line marked with "xxx..."), then the code produced is optimal and
> exactly what one expects. I show this result in "Output B" where you
> get a beatiful sequence of addsd into register xmm2.
>
> It's all very strange and it points to some questionable decision
> making on the part of llvm. I tried different versions of the sum()
> function (elliminating the loop for example) but it does not help.
> Another observation is that the loop variable i (in foo) must be
> involved: if one does *p = 5 (instead of p[i] = 5), the problem also
> goes away.
>
> I would appreciate some advice on how to get around this problem.
>
> Thank you for any help,
> Brent
>
>
> double sum( double* v, int v_siz )
> {
> double sum = 0.0;
> int i = 0;
>
> for (; i != v_siz; ++i)
> sum += v[i];
>
> return sum;
> }
>
> double foo(double *x, int *p, int k)
> {
> double s = 0.0;
> for (int i = 0; i != k;++i)
> {
> s += sum(&x[i], 18);
> p[i] = 5; // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
> }
> return s;
> }
>
> ====== Output A ======
> ======================
> foo: # @foo
> .Ltmp12:
> .cfi_startproc
> # BB#0:
> pushl %ebx
> .Ltmp13:
> .cfi_def_cfa_offset 8
> pushl %edi
> .Ltmp14:
> .cfi_def_cfa_offset 12
> pushl %esi
> .Ltmp15:
> .cfi_def_cfa_offset 16
> subl $88, %esp
> .Ltmp16:
> .cfi_def_cfa_offset 104
> .Ltmp17:
> .cfi_offset %esi, -16
> .Ltmp18:
> .cfi_offset %edi, -12
> .Ltmp19:
> .cfi_offset %ebx, -8
> pxor %xmm0, %xmm0
> movl 112(%esp), %eax
> testl %eax, %eax
> je .LBB1_3
> # BB#1:
> xorl %ebx, %ebx
> movl 108(%esp), %ecx
> movl 104(%esp), %edx
> xorl %esi, %esi
> .align 16, 0x90
> .LBB1_2: # %.lr.ph.i
> # =>This Inner Loop Header: Depth=1
> movsd (%edx,%ebx,8), %xmm2
> addsd .LCPI1_0, %xmm2
> movsd 16(%edx,%ebx,8), %xmm1
> movsd %xmm1, (%esp) # 8-byte Spill
> movl %ebx, %edi
> addl $1, %edi
> addsd (%edx,%edi,8), %xmm2
> movsd 136(%edx,%ebx,8), %xmm1
> movsd %xmm1, 72(%esp) # 8-byte Spill
> movsd 128(%edx,%ebx,8), %xmm1
> movsd %xmm1, 64(%esp) # 8-byte Spill
> movsd 120(%edx,%ebx,8), %xmm1
> movsd %xmm1, 56(%esp) # 8-byte Spill
> movsd 112(%edx,%ebx,8), %xmm1
> movsd %xmm1, 48(%esp) # 8-byte Spill
> movsd 104(%edx,%ebx,8), %xmm1
> movsd %xmm1, 40(%esp) # 8-byte Spill
> movsd 96(%edx,%ebx,8), %xmm1
> movsd %xmm1, 32(%esp) # 8-byte Spill
> movsd 88(%edx,%ebx,8), %xmm1
> movsd %xmm1, 24(%esp) # 8-byte Spill
> movsd 80(%edx,%ebx,8), %xmm1
> movsd %xmm1, 16(%esp) # 8-byte Spill
> movsd 72(%edx,%ebx,8), %xmm1
> movsd %xmm1, 8(%esp) # 8-byte Spill
> movsd 64(%edx,%ebx,8), %xmm7
> movsd 56(%edx,%ebx,8), %xmm1
> movsd 48(%edx,%ebx,8), %xmm3
> movsd 40(%edx,%ebx,8), %xmm4
> movsd 32(%edx,%ebx,8), %xmm5
> movsd 24(%edx,%ebx,8), %xmm6
> movl $5, (%ecx,%ebx,4)
> addsd (%esp), %xmm2 # 8-byte Folded Reload
> addsd %xmm6, %xmm2
> addsd %xmm5, %xmm2
> addsd %xmm4, %xmm2
> addsd %xmm3, %xmm2
> addsd %xmm1, %xmm2
> addsd %xmm7, %xmm2
> addsd 8(%esp), %xmm2 # 8-byte Folded Reload
> addsd 16(%esp), %xmm2 # 8-byte Folded Reload
> addsd 24(%esp), %xmm2 # 8-byte Folded Reload
> addsd 32(%esp), %xmm2 # 8-byte Folded Reload
> addsd 40(%esp), %xmm2 # 8-byte Folded Reload
> addsd 48(%esp), %xmm2 # 8-byte Folded Reload
> addsd 56(%esp), %xmm2 # 8-byte Folded Reload
> addsd 64(%esp), %xmm2 # 8-byte Folded Reload
> addsd 72(%esp), %xmm2 # 8-byte Folded Reload
> addsd %xmm2, %xmm0
> adcl $0, %esi
> cmpl %eax, %edi
> movl %edi, %ebx
> jne .LBB1_2
> .LBB1_3: # %._crit_edge
> movsd %xmm0, 80(%esp)
> fldl 80(%esp)
> addl $88, %esp
> popl %esi
> popl %edi
> popl %ebx
> ret
> .Ltmp20:
> .size foo, .Ltmp20-foo
> .Ltmp21:
> .cfi_endproc
> .Leh_func_end1:
>
>
> ====== Output B ======
> ======================
>
> foo: # @foo
> .Ltmp11:
> .cfi_startproc
> # BB#0:
> pushl %edi
> .Ltmp12:
> .cfi_def_cfa_offset 8
> pushl %esi
> .Ltmp13:
> .cfi_def_cfa_offset 12
> subl $12, %esp
> .Ltmp14:
> .cfi_def_cfa_offset 24
> .Ltmp15:
> .cfi_offset %esi, -12
> .Ltmp16:
> .cfi_offset %edi, -8
> pxor %xmm0, %xmm0
> movl 32(%esp), %eax
> testl %eax, %eax
> je .LBB1_3
> # BB#1:
> xorl %esi, %esi
> movl 24(%esp), %ecx
> pxor %xmm1, %xmm1
> xorl %edx, %edx
> .align 16, 0x90
> .LBB1_2: # %.lr.ph.i
> # =>This Inner Loop Header: Depth=1
> movsd (%ecx,%esi,8), %xmm2
> addsd %xmm1, %xmm2
> movl %esi, %edi
> addl $1, %edi
> addsd (%ecx,%edi,8), %xmm2
> addsd 16(%ecx,%esi,8), %xmm2
> addsd 24(%ecx,%esi,8), %xmm2
> addsd 32(%ecx,%esi,8), %xmm2
> addsd 40(%ecx,%esi,8), %xmm2
> addsd 48(%ecx,%esi,8), %xmm2
> addsd 56(%ecx,%esi,8), %xmm2
> addsd 64(%ecx,%esi,8), %xmm2
> addsd 72(%ecx,%esi,8), %xmm2
> addsd 80(%ecx,%esi,8), %xmm2
> addsd 88(%ecx,%esi,8), %xmm2
> addsd 96(%ecx,%esi,8), %xmm2
> addsd 104(%ecx,%esi,8), %xmm2
> addsd 112(%ecx,%esi,8), %xmm2
> addsd 120(%ecx,%esi,8), %xmm2
> addsd 128(%ecx,%esi,8), %xmm2
> addsd 136(%ecx,%esi,8), %xmm2
> addsd %xmm2, %xmm0
> adcl $0, %edx
> cmpl %eax, %edi
> movl %edi, %esi
> jne .LBB1_2
> .LBB1_3: # %._crit_edge
> movsd %xmm0, (%esp)
> fldl (%esp)
> addl $12, %esp
> popl %esi
> popl %edi
> ret
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
More information about the llvm-dev
mailing list