[LLVMbugs] [Bug 12392] New: Poor performance due to register spilling when no spilling is necessary

Wed Mar 28 22:20:10 PDT 2012

http://llvm.org/bugs/show_bug.cgi?id=12392

             Bug #: 12392
           Summary: Poor performance due to register spilling when no
                    spilling is necessary
           Product: new-bugs
           Version: 3.0
          Platform: PC
        OS/Version: Windows XP
            Status: NEW
          Severity: normal
          Priority: P
         Component: new bugs
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: brenthwalker at gmail.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

I have run into the following strange llvm behavior.  For the C program below,
function sum() gets inlined in foo() but the code generated looks very
suboptimal (the code is an extract from a larger program).

Below I show the 32-bit x86 assembly as produced by the demo page on
the llvm home page ("Output A").  As you can see from the assembly,
after sum() is inlined and the loop unrolled, the generated code
loads all values of array v (aka &x[i]) into registers before adding
any numbers up -- in the process it runs out of registers and starts
spilling (in essence copying the doubles from one area of memory to
another).  After that, it proceeds to add the numbers up.

But why not add the numbers into 1 register directly?  Clearly this is
what the C code is doing -- nothing could have been more explicit.
The really strange thing is that if the assignment to p[i] is removed
(line marked with "xxx..."), then the code produced is optimal and
exactly what one expects.  I show this result in "Output B" where you
get a beautiful sequence of addsd into register xmm2.

It's all very strange and it points to some questionable decision
making on the part of llvm.  I tried different versions of the sum()
function (eliminating the loop, for example) but it does not help.
Another observation is that the loop variable i (in foo) must be
involved: if one does *p = 5 (instead of p[i] = 5), the problem also
goes away.

Until a fix comes, if you have a suggestion on how to get around this problem
please let me know.

Here is the code:

// Return the sum of the v_siz doubles v[0] .. v[v_siz - 1].
// The loop counts up from 0 and stops only when i == v_siz.
// Note: the local accumulator below shares its name with the function.
double sum( double* v, int v_siz )
{
   double sum = 0.0;
   int i = 0;

   // Accumulate in index order into the single running total.
   for (; i != v_siz; ++i)
       sum += v[i];

   return sum;
}

// For each i from 0 up to (but not including) k: add the sum of the 18
// doubles x[i] .. x[i + 17] to a running total and store 5 into p[i].
// Returns the running total.
// The last iteration reads up to x[k + 16]; the loop stops only when i == k.
double foo(double *x, int *p, int k)
{
   double s = 0.0;
   for (int i = 0; i != k;++i)
   {
      // Window of 18 doubles starting at x[i].
      s += sum(&x[i], 18);
      // The store below is the line the report refers to as marked "xxx...".
      p[i] = 5;   // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
   }
   return s;
}

====== Output A ======
======================
# foo -- 32-bit x86 listing (AT&T syntax) for the variant that stores to p[i].
# Frame: three 4-byte pushes plus an 88-byte local area; together with the
# 4-byte return address this puts the stack arguments at 104(%esp),
# 108(%esp) and 112(%esp) after the prologue.
# Loop shape: two doubles are added into %xmm2 directly, the other sixteen
# are first loaded (ten of them spilled to stack slots), then the store of 5
# is done, and only afterwards are the sixteen values added into %xmm2.
foo:                                    # @foo
.Ltmp12:
       .cfi_startproc
# BB#0:
       pushl   %ebx
.Ltmp13:
       .cfi_def_cfa_offset 8
       pushl   %edi
.Ltmp14:
       .cfi_def_cfa_offset 12
       pushl   %esi
.Ltmp15:
       .cfi_def_cfa_offset 16
       subl    $88, %esp
.Ltmp16:
       .cfi_def_cfa_offset 104
.Ltmp17:
       .cfi_offset %esi, -16
.Ltmp18:
       .cfi_offset %edi, -12
.Ltmp19:
       .cfi_offset %ebx, -8
       # %xmm0 = 0.0 is the running total; %eax = third argument (loop bound).
       # A zero bound skips the loop entirely.
       pxor    %xmm0, %xmm0
       movl    112(%esp), %eax
       testl   %eax, %eax
       je      .LBB1_3
# BB#1:
       # %ebx = loop index (starts at 0), %ecx = second argument (base of the
       # 4-byte stores), %edx = first argument (base of the 8-byte loads).
       xorl    %ebx, %ebx
       movl    108(%esp), %ecx
       movl    104(%esp), %edx
       xorl    %esi, %esi
       .align  16, 0x90
.LBB1_2:                                # %.lr.ph.i
                                       # =>This Inner Loop Header: Depth=1
       # %xmm2 = double at index %ebx plus the constant .LCPI1_0
       # (.LCPI1_0 is not defined in this listing; presumably 0.0 -- TODO confirm).
       movsd   (%edx,%ebx,8), %xmm2
       addsd   .LCPI1_0, %xmm2
       # The double at byte offset 16 is loaded and spilled straight away.
       movsd   16(%edx,%ebx,8), %xmm1
       movsd   %xmm1, (%esp)           # 8-byte Spill
       # %edi = index + 1; the double at that index is added directly.
       movl    %ebx, %edi
       addl    $1, %edi
       addsd   (%edx,%edi,8), %xmm2
       # Byte offsets 136 down to 72: each value goes through %xmm1 into its
       # own stack slot.
       movsd   136(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 72(%esp)         # 8-byte Spill
       movsd   128(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 64(%esp)         # 8-byte Spill
       movsd   120(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 56(%esp)         # 8-byte Spill
       movsd   112(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 48(%esp)         # 8-byte Spill
       movsd   104(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 40(%esp)         # 8-byte Spill
       movsd   96(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 32(%esp)         # 8-byte Spill
       movsd   88(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 24(%esp)         # 8-byte Spill
       movsd   80(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 16(%esp)         # 8-byte Spill
       movsd   72(%edx,%ebx,8), %xmm1
       movsd   %xmm1, 8(%esp)          # 8-byte Spill
       # Byte offsets 64 down to 24 are kept in registers.
       movsd   64(%edx,%ebx,8), %xmm7
       movsd   56(%edx,%ebx,8), %xmm1
       movsd   48(%edx,%ebx,8), %xmm3
       movsd   40(%edx,%ebx,8), %xmm4
       movsd   32(%edx,%ebx,8), %xmm5
       movsd   24(%edx,%ebx,8), %xmm6
       # Store the 32-bit value 5 at index %ebx of the second argument.
       movl    $5, (%ecx,%ebx,4)
       # After the store, the sixteen held values are added into %xmm2 in
       # ascending byte-offset order (16, 24, ..., 136).
       addsd   (%esp), %xmm2           # 8-byte Folded Reload
       addsd   %xmm6, %xmm2
       addsd   %xmm5, %xmm2
       addsd   %xmm4, %xmm2
       addsd   %xmm3, %xmm2
       addsd   %xmm1, %xmm2
       addsd   %xmm7, %xmm2
       addsd   8(%esp), %xmm2          # 8-byte Folded Reload
       addsd   16(%esp), %xmm2         # 8-byte Folded Reload
       addsd   24(%esp), %xmm2         # 8-byte Folded Reload
       addsd   32(%esp), %xmm2         # 8-byte Folded Reload
       addsd   40(%esp), %xmm2         # 8-byte Folded Reload
       addsd   48(%esp), %xmm2         # 8-byte Folded Reload
       addsd   56(%esp), %xmm2         # 8-byte Folded Reload
       addsd   64(%esp), %xmm2         # 8-byte Folded Reload
       addsd   72(%esp), %xmm2         # 8-byte Folded Reload
       # Fold this iteration's sum into the running total.
       addsd   %xmm2, %xmm0
       # NOTE(review): adds the carry flag into %esi; %esi is zeroed before the
       # loop and not read anywhere else in this listing. Presumably the carry
       # comes from the addl $1 above -- confirm.
       adcl    $0, %esi
       # Repeat until the incremented index equals the bound; %ebx takes the
       # incremented index for the next iteration.
       cmpl    %eax, %edi
       movl    %edi, %ebx
       jne     .LBB1_2
.LBB1_3:                                # %._crit_edge
       # Copy the total through a stack slot onto the x87 register stack,
       # then release the frame and restore the saved registers.
       movsd   %xmm0, 80(%esp)
       fldl    80(%esp)
       addl    $88, %esp
       popl    %esi
       popl    %edi
       popl    %ebx
       ret
.Ltmp20:
       .size   foo, .Ltmp20-foo
.Ltmp21:
       .cfi_endproc
.Leh_func_end1:

====== Output B ======
======================

# foo -- 32-bit x86 listing (AT&T syntax) for the variant without the store.
# Frame: two 4-byte pushes plus a 12-byte local area; together with the
# 4-byte return address this puts the first stack argument at 24(%esp) and
# the third at 32(%esp). The slot at 28(%esp) is never read in this listing.
# Loop shape: all eighteen doubles are added into %xmm2, seventeen of them
# straight from memory operands; there are no spills inside the loop.
# The pasted listing ends at ret (no .size / .cfi_endproc lines follow).
foo:                                    # @foo
.Ltmp11:
       .cfi_startproc
# BB#0:
       pushl   %edi
.Ltmp12:
       .cfi_def_cfa_offset 8
       pushl   %esi
.Ltmp13:
       .cfi_def_cfa_offset 12
       subl    $12, %esp
.Ltmp14:
       .cfi_def_cfa_offset 24
.Ltmp15:
       .cfi_offset %esi, -12
.Ltmp16:
       .cfi_offset %edi, -8
       # %xmm0 = 0.0 is the running total; %eax = third argument (loop bound).
       # A zero bound skips the loop entirely.
       pxor    %xmm0, %xmm0
       movl    32(%esp), %eax
       testl   %eax, %eax
       je      .LBB1_3
# BB#1:
       # %esi = loop index (starts at 0), %ecx = first argument (base of the
       # 8-byte loads), %xmm1 = 0.0 used as the first addend in each iteration.
       xorl    %esi, %esi
       movl    24(%esp), %ecx
       pxor    %xmm1, %xmm1
       xorl    %edx, %edx
       .align  16, 0x90
.LBB1_2:                                # %.lr.ph.i
                                       # =>This Inner Loop Header: Depth=1
       # %xmm2 = double at index %esi plus the zero held in %xmm1.
       movsd   (%ecx,%esi,8), %xmm2
       addsd   %xmm1, %xmm2
       # %edi = index + 1; the double at that index is added directly.
       movl    %esi, %edi
       addl    $1, %edi
       addsd   (%ecx,%edi,8), %xmm2
       # Byte offsets 16 through 136 are added in ascending order, each one
       # as a memory operand of addsd.
       addsd   16(%ecx,%esi,8), %xmm2
       addsd   24(%ecx,%esi,8), %xmm2
       addsd   32(%ecx,%esi,8), %xmm2
       addsd   40(%ecx,%esi,8), %xmm2
       addsd   48(%ecx,%esi,8), %xmm2
       addsd   56(%ecx,%esi,8), %xmm2
       addsd   64(%ecx,%esi,8), %xmm2
       addsd   72(%ecx,%esi,8), %xmm2
       addsd   80(%ecx,%esi,8), %xmm2
       addsd   88(%ecx,%esi,8), %xmm2
       addsd   96(%ecx,%esi,8), %xmm2
       addsd   104(%ecx,%esi,8), %xmm2
       addsd   112(%ecx,%esi,8), %xmm2
       addsd   120(%ecx,%esi,8), %xmm2
       addsd   128(%ecx,%esi,8), %xmm2
       addsd   136(%ecx,%esi,8), %xmm2
       # Fold this iteration's sum into the running total.
       addsd   %xmm2, %xmm0
       # NOTE(review): adds the carry flag into %edx; %edx is zeroed before the
       # loop and not read anywhere else in this listing. Presumably the carry
       # comes from the addl $1 above -- confirm.
       adcl    $0, %edx
       # Repeat until the incremented index equals the bound; %esi takes the
       # incremented index for the next iteration.
       cmpl    %eax, %edi
       movl    %edi, %esi
       jne     .LBB1_2
.LBB1_3:                                # %._crit_edge
       # Copy the total through a stack slot onto the x87 register stack,
       # then release the frame and restore the saved registers.
       movsd   %xmm0, (%esp)
       fldl    (%esp)
       addl    $12, %esp
       popl    %esi
       popl    %edi
       ret

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.