[LLVMdev] Suboptimal code due to excessive spilling
Brent Walker
brenthwalker at gmail.com
Tue Mar 27 18:17:46 PDT 2012
Hi,
I have run into the following strange behavior and wanted to ask for
some advice. In the C program below, the function sum() gets inlined
into foo(), but the generated code looks very suboptimal (the code is
an extract from a larger program).
Below I show the 32-bit x86 assembly as produced by the demo page on
the LLVM home page ("Output A"). As you can see, after sum() is
inlined and the loop unrolled, the generated code loads all values of
the array v (aka &x[i]) into registers before adding any numbers up --
in the process it runs out of registers and starts spilling (in
essence, copying the doubles from one area of memory to another).
Only after that does it proceed to add the numbers up.
But why not add the numbers into one register directly? Clearly that
is what the C code is doing -- nothing could be more explicit.
The really strange thing is that if the assignment to p[i] is removed
(the line marked with "xxx..."), the code produced is optimal and
exactly what one expects. I show this result in "Output B", where you
get a beautiful sequence of addsd instructions accumulating into
register %xmm2.
It's all very strange, and it points to some questionable decision
making on the part of LLVM. I tried different versions of the sum()
function (eliminating the loop, for example), but it does not help.
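In particular, even a fully unrolled, loop-free variant along these
lines (sum18 is just an illustrative sketch of the kind of thing I
tried, not my exact code) still shows the spilling:

double sum18( double* v )
{
    /* no loop at all: 18 explicit adds */
    return v[0]  + v[1]  + v[2]  + v[3]  + v[4]  + v[5]
         + v[6]  + v[7]  + v[8]  + v[9]  + v[10] + v[11]
         + v[12] + v[13] + v[14] + v[15] + v[16] + v[17];
}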
Another observation is that the loop variable i (in foo) must be
involved: if one writes *p = 5 instead of p[i] = 5, the problem also
goes away, as sketched below.
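Concretely, this variant of foo (identical except for the store; the
name foo2 is just for illustration) no longer exhibits the spilling:

double foo2(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        *p = 5; /* store no longer indexed by i -- problem gone */
    }
    return s;
}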
I would appreciate some advice on how to get around this problem.
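For instance, is possible aliasing between p and x what forces all the
loads ahead of the int store, and would restrict-qualified pointers
help? Something like the following (just a guess on my part -- I have
not verified that it changes the output):

double foo(double * restrict x, int * restrict p, int k)
{
    /* restrict (C99) asserts that x and p never alias */
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5;
    }
    return s;
}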
Thank you for any help,
Brent
double sum( double* v, int v_siz )
{
    double sum = 0.0;
    int i = 0;
    for (; i != v_siz; ++i)
        sum += v[i];
    return sum;
}

double foo(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5; // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    }
    return s;
}
====== Output A ======
======================
foo: # @foo
.Ltmp12:
.cfi_startproc
# BB#0:
pushl %ebx
.Ltmp13:
.cfi_def_cfa_offset 8
pushl %edi
.Ltmp14:
.cfi_def_cfa_offset 12
pushl %esi
.Ltmp15:
.cfi_def_cfa_offset 16
subl $88, %esp
.Ltmp16:
.cfi_def_cfa_offset 104
.Ltmp17:
.cfi_offset %esi, -16
.Ltmp18:
.cfi_offset %edi, -12
.Ltmp19:
.cfi_offset %ebx, -8
pxor %xmm0, %xmm0
movl 112(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %ebx, %ebx
movl 108(%esp), %ecx
movl 104(%esp), %edx
xorl %esi, %esi
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%edx,%ebx,8), %xmm2
addsd .LCPI1_0, %xmm2
movsd 16(%edx,%ebx,8), %xmm1
movsd %xmm1, (%esp) # 8-byte Spill
movl %ebx, %edi
addl $1, %edi
addsd (%edx,%edi,8), %xmm2
movsd 136(%edx,%ebx,8), %xmm1
movsd %xmm1, 72(%esp) # 8-byte Spill
movsd 128(%edx,%ebx,8), %xmm1
movsd %xmm1, 64(%esp) # 8-byte Spill
movsd 120(%edx,%ebx,8), %xmm1
movsd %xmm1, 56(%esp) # 8-byte Spill
movsd 112(%edx,%ebx,8), %xmm1
movsd %xmm1, 48(%esp) # 8-byte Spill
movsd 104(%edx,%ebx,8), %xmm1
movsd %xmm1, 40(%esp) # 8-byte Spill
movsd 96(%edx,%ebx,8), %xmm1
movsd %xmm1, 32(%esp) # 8-byte Spill
movsd 88(%edx,%ebx,8), %xmm1
movsd %xmm1, 24(%esp) # 8-byte Spill
movsd 80(%edx,%ebx,8), %xmm1
movsd %xmm1, 16(%esp) # 8-byte Spill
movsd 72(%edx,%ebx,8), %xmm1
movsd %xmm1, 8(%esp) # 8-byte Spill
movsd 64(%edx,%ebx,8), %xmm7
movsd 56(%edx,%ebx,8), %xmm1
movsd 48(%edx,%ebx,8), %xmm3
movsd 40(%edx,%ebx,8), %xmm4
movsd 32(%edx,%ebx,8), %xmm5
movsd 24(%edx,%ebx,8), %xmm6
movl $5, (%ecx,%ebx,4)
addsd (%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm6, %xmm2
addsd %xmm5, %xmm2
addsd %xmm4, %xmm2
addsd %xmm3, %xmm2
addsd %xmm1, %xmm2
addsd %xmm7, %xmm2
addsd 8(%esp), %xmm2 # 8-byte Folded Reload
addsd 16(%esp), %xmm2 # 8-byte Folded Reload
addsd 24(%esp), %xmm2 # 8-byte Folded Reload
addsd 32(%esp), %xmm2 # 8-byte Folded Reload
addsd 40(%esp), %xmm2 # 8-byte Folded Reload
addsd 48(%esp), %xmm2 # 8-byte Folded Reload
addsd 56(%esp), %xmm2 # 8-byte Folded Reload
addsd 64(%esp), %xmm2 # 8-byte Folded Reload
addsd 72(%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm2, %xmm0
adcl $0, %esi
cmpl %eax, %edi
movl %edi, %ebx
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, 80(%esp)
fldl 80(%esp)
addl $88, %esp
popl %esi
popl %edi
popl %ebx
ret
.Ltmp20:
.size foo, .Ltmp20-foo
.Ltmp21:
.cfi_endproc
.Leh_func_end1:
====== Output B ======
======================
foo: # @foo
.Ltmp11:
.cfi_startproc
# BB#0:
pushl %edi
.Ltmp12:
.cfi_def_cfa_offset 8
pushl %esi
.Ltmp13:
.cfi_def_cfa_offset 12
subl $12, %esp
.Ltmp14:
.cfi_def_cfa_offset 24
.Ltmp15:
.cfi_offset %esi, -12
.Ltmp16:
.cfi_offset %edi, -8
pxor %xmm0, %xmm0
movl 32(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %esi, %esi
movl 24(%esp), %ecx
pxor %xmm1, %xmm1
xorl %edx, %edx
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%ecx,%esi,8), %xmm2
addsd %xmm1, %xmm2
movl %esi, %edi
addl $1, %edi
addsd (%ecx,%edi,8), %xmm2
addsd 16(%ecx,%esi,8), %xmm2
addsd 24(%ecx,%esi,8), %xmm2
addsd 32(%ecx,%esi,8), %xmm2
addsd 40(%ecx,%esi,8), %xmm2
addsd 48(%ecx,%esi,8), %xmm2
addsd 56(%ecx,%esi,8), %xmm2
addsd 64(%ecx,%esi,8), %xmm2
addsd 72(%ecx,%esi,8), %xmm2
addsd 80(%ecx,%esi,8), %xmm2
addsd 88(%ecx,%esi,8), %xmm2
addsd 96(%ecx,%esi,8), %xmm2
addsd 104(%ecx,%esi,8), %xmm2
addsd 112(%ecx,%esi,8), %xmm2
addsd 120(%ecx,%esi,8), %xmm2
addsd 128(%ecx,%esi,8), %xmm2
addsd 136(%ecx,%esi,8), %xmm2
addsd %xmm2, %xmm0
adcl $0, %edx
cmpl %eax, %edi
movl %edi, %esi
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, (%esp)
fldl (%esp)
addl $12, %esp
popl %esi
popl %edi
ret