[LLVMdev] Suboptimal code due to excessive spilling
Brent Walker
brenthwalker at gmail.com
Tue Mar 27 18:17:46 PDT 2012
Hi,
I have run into the following strange behavior and wanted to ask for
some advice. In the C program below, the function sum() gets inlined
into foo(), but the generated code looks very suboptimal (the code is
an extract from a larger program).
Below I show the 32-bit x86 assembly as produced by the demo page on
the LLVM home page ("Output A"). As you can see, after sum() is
inlined and the loop unrolled, the generated code loads all values of
the array v (aka &x[i]) into registers before adding any numbers up --
in the process it runs out of registers and starts spilling (in
essence, copying the doubles from one area of memory to another).
Only after that does it proceed to add the numbers up.
But why not add the numbers into one register directly? Clearly that
is what the C code is doing -- nothing could be more explicit.
The really strange thing is that if the assignment to p[i] is removed
(the line marked with "xxx..."), the code produced is optimal and
exactly what one expects. I show this result in "Output B", where you
get a beautiful sequence of addsd instructions accumulating into
register %xmm2.
It's all very strange, and it points to some questionable decision
making on the part of LLVM. I tried different versions of the sum()
function (eliminating the loop, for example), but it does not help.
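In particular, even a fully unrolled, loop-free variant along these
lines (sum18 is just an illustrative sketch of the kind of thing I
tried, not my exact code) still shows the spilling:

double sum18( double* v )
{
    /* no loop at all: 18 explicit adds */
    return v[0]  + v[1]  + v[2]  + v[3]  + v[4]  + v[5]
         + v[6]  + v[7]  + v[8]  + v[9]  + v[10] + v[11]
         + v[12] + v[13] + v[14] + v[15] + v[16] + v[17];
}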
Another observation is that the loop variable i (in foo) must be
involved: if one writes *p = 5 instead of p[i] = 5, the problem also
goes away, as sketched below.
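Concretely, this variant of foo (identical except for the store; the
name foo2 is just for illustration) no longer exhibits the spilling:

double foo2(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        *p = 5; /* store no longer indexed by i -- problem gone */
    }
    return s;
}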
I would appreciate some advice on how to get around this problem.
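For instance, is possible aliasing between p and x what forces all the
loads ahead of the int store, and would restrict-qualified pointers
help? Something like the following (just a guess on my part -- I have
not verified that it changes the output):

double foo(double * restrict x, int * restrict p, int k)
{
    /* restrict (C99) asserts that x and p never alias */
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5;
    }
    return s;
}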
Thank you for any help,
Brent
double sum( double* v, int v_siz )
{
    double sum = 0.0;
    int i = 0;
    for (; i != v_siz; ++i)
        sum += v[i];
    return sum;
}

double foo(double *x, int *p, int k)
{
    double s = 0.0;
    for (int i = 0; i != k; ++i)
    {
        s += sum(&x[i], 18);
        p[i] = 5; // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    }
    return s;
}
====== Output A ======
======================
foo: # @foo
.Ltmp12:
.cfi_startproc
# BB#0:
pushl %ebx
.Ltmp13:
.cfi_def_cfa_offset 8
pushl %edi
.Ltmp14:
.cfi_def_cfa_offset 12
pushl %esi
.Ltmp15:
.cfi_def_cfa_offset 16
subl $88, %esp
.Ltmp16:
.cfi_def_cfa_offset 104
.Ltmp17:
.cfi_offset %esi, -16
.Ltmp18:
.cfi_offset %edi, -12
.Ltmp19:
.cfi_offset %ebx, -8
pxor %xmm0, %xmm0
movl 112(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %ebx, %ebx
movl 108(%esp), %ecx
movl 104(%esp), %edx
xorl %esi, %esi
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%edx,%ebx,8), %xmm2
addsd .LCPI1_0, %xmm2
movsd 16(%edx,%ebx,8), %xmm1
movsd %xmm1, (%esp) # 8-byte Spill
movl %ebx, %edi
addl $1, %edi
addsd (%edx,%edi,8), %xmm2
movsd 136(%edx,%ebx,8), %xmm1
movsd %xmm1, 72(%esp) # 8-byte Spill
movsd 128(%edx,%ebx,8), %xmm1
movsd %xmm1, 64(%esp) # 8-byte Spill
movsd 120(%edx,%ebx,8), %xmm1
movsd %xmm1, 56(%esp) # 8-byte Spill
movsd 112(%edx,%ebx,8), %xmm1
movsd %xmm1, 48(%esp) # 8-byte Spill
movsd 104(%edx,%ebx,8), %xmm1
movsd %xmm1, 40(%esp) # 8-byte Spill
movsd 96(%edx,%ebx,8), %xmm1
movsd %xmm1, 32(%esp) # 8-byte Spill
movsd 88(%edx,%ebx,8), %xmm1
movsd %xmm1, 24(%esp) # 8-byte Spill
movsd 80(%edx,%ebx,8), %xmm1
movsd %xmm1, 16(%esp) # 8-byte Spill
movsd 72(%edx,%ebx,8), %xmm1
movsd %xmm1, 8(%esp) # 8-byte Spill
movsd 64(%edx,%ebx,8), %xmm7
movsd 56(%edx,%ebx,8), %xmm1
movsd 48(%edx,%ebx,8), %xmm3
movsd 40(%edx,%ebx,8), %xmm4
movsd 32(%edx,%ebx,8), %xmm5
movsd 24(%edx,%ebx,8), %xmm6
movl $5, (%ecx,%ebx,4)
addsd (%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm6, %xmm2
addsd %xmm5, %xmm2
addsd %xmm4, %xmm2
addsd %xmm3, %xmm2
addsd %xmm1, %xmm2
addsd %xmm7, %xmm2
addsd 8(%esp), %xmm2 # 8-byte Folded Reload
addsd 16(%esp), %xmm2 # 8-byte Folded Reload
addsd 24(%esp), %xmm2 # 8-byte Folded Reload
addsd 32(%esp), %xmm2 # 8-byte Folded Reload
addsd 40(%esp), %xmm2 # 8-byte Folded Reload
addsd 48(%esp), %xmm2 # 8-byte Folded Reload
addsd 56(%esp), %xmm2 # 8-byte Folded Reload
addsd 64(%esp), %xmm2 # 8-byte Folded Reload
addsd 72(%esp), %xmm2 # 8-byte Folded Reload
addsd %xmm2, %xmm0
adcl $0, %esi
cmpl %eax, %edi
movl %edi, %ebx
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, 80(%esp)
fldl 80(%esp)
addl $88, %esp
popl %esi
popl %edi
popl %ebx
ret
.Ltmp20:
.size foo, .Ltmp20-foo
.Ltmp21:
.cfi_endproc
.Leh_func_end1:
====== Output B ======
======================
foo: # @foo
.Ltmp11:
.cfi_startproc
# BB#0:
pushl %edi
.Ltmp12:
.cfi_def_cfa_offset 8
pushl %esi
.Ltmp13:
.cfi_def_cfa_offset 12
subl $12, %esp
.Ltmp14:
.cfi_def_cfa_offset 24
.Ltmp15:
.cfi_offset %esi, -12
.Ltmp16:
.cfi_offset %edi, -8
pxor %xmm0, %xmm0
movl 32(%esp), %eax
testl %eax, %eax
je .LBB1_3
# BB#1:
xorl %esi, %esi
movl 24(%esp), %ecx
pxor %xmm1, %xmm1
xorl %edx, %edx
.align 16, 0x90
.LBB1_2: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%ecx,%esi,8), %xmm2
addsd %xmm1, %xmm2
movl %esi, %edi
addl $1, %edi
addsd (%ecx,%edi,8), %xmm2
addsd 16(%ecx,%esi,8), %xmm2
addsd 24(%ecx,%esi,8), %xmm2
addsd 32(%ecx,%esi,8), %xmm2
addsd 40(%ecx,%esi,8), %xmm2
addsd 48(%ecx,%esi,8), %xmm2
addsd 56(%ecx,%esi,8), %xmm2
addsd 64(%ecx,%esi,8), %xmm2
addsd 72(%ecx,%esi,8), %xmm2
addsd 80(%ecx,%esi,8), %xmm2
addsd 88(%ecx,%esi,8), %xmm2
addsd 96(%ecx,%esi,8), %xmm2
addsd 104(%ecx,%esi,8), %xmm2
addsd 112(%ecx,%esi,8), %xmm2
addsd 120(%ecx,%esi,8), %xmm2
addsd 128(%ecx,%esi,8), %xmm2
addsd 136(%ecx,%esi,8), %xmm2
addsd %xmm2, %xmm0
adcl $0, %edx
cmpl %eax, %edi
movl %edi, %esi
jne .LBB1_2
.LBB1_3: # %._crit_edge
movsd %xmm0, (%esp)
fldl (%esp)
addl $12, %esp
popl %esi
popl %edi
ret