[LLVMbugs] [Bug 13095] New: Give an inline cost bonus to byval functions

Tue Jun 12 13:28:15 PDT 2012

http://llvm.org/bugs/show_bug.cgi?id=13095

             Bug #: 13095
           Summary: Give an inline cost bonus to byval functions
           Product: libraries
           Version: trunk
          Platform: PC
        OS/Version: All
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Interprocedural Optimizations
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: benny.kra at gmail.com
                CC: chandlerc at gmail.com, llvmbugs at cs.uiuc.edu
    Classification: Unclassified

A byval argument can be interpreted as a bundle of arguments. The inline cost
analysis gives a bonus for every argument, but it doesn't give an extra boost
to byval arguments.

Take this simple example (it's a bit braindead so that it can tickle the
inliner; real-world examples are more complicated):

----8<----
/* Aggregate that b() below takes by value. Despite the "6" in the name it
   declares eight long members, a through h. */
struct vec6 {
  long a, b, c, d, e, f, g, h;
};

/* Only a prototype is given in this snippet, so the compiler has no body to
   inline at the call sites below. */
void stopinliner(int);
/* Takes the whole struct by value and makes twelve calls to stopinliner():
   one for each member a..h, followed by four more with x.a. Every long member
   is implicitly converted to the int parameter. The always_inline attribute
   is left commented out, so inlining b() is up to the inliner. */
/*__attribute__((always_inline))*/ void b(struct vec6 x) {
  stopinliner(x.a);
  stopinliner(x.b);
  stopinliner(x.c);
  stopinliner(x.d);
  stopinliner(x.e);
  stopinliner(x.f);
  stopinliner(x.g);
  stopinliner(x.h);
  stopinliner(x.a);
  stopinliner(x.a);
  stopinliner(x.a);
  stopinliner(x.a);
}

/* Caller side of the example: builds a local vec6 initialised to 1..8 and
   passes it to b() by value. */
void c(void) {
  struct vec6 x = { 1, 2, 3, 4, 5, 6, 7, 8 };
  b(x);
}

--->8---

clang -O3 generates

---8<---

## Annotation (not compiler output): code for b(struct vec6 x). The struct is
## read from the caller's stack: once %rbp is set up, 0(%rbp) is the saved
## %rbp, 8(%rbp) the return address, and 16(%rbp)..72(%rbp) the eight 8-byte
## members in declaration order. Only the low 32 bits of each member are
## loaded (movl), matching the int parameter of stopinliner.
_b:                                     ## @b
    .cfi_startproc
## BB#0:                                ## %entry
    pushq    %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    pushq    %rbx
    ## Annotation: the pushed %rax value is never popped back; the slot is
    ## released by "addq $8, %rsp" below. With the two pushes above it leaves
    ## %rsp 16-byte aligned at each callq (SysV AMD64 convention).
    pushq    %rax
Ltmp6:
    .cfi_offset %rbx, -24
    ## Annotation: x.a is loaded once into %ebx so that it survives the calls
    ## and can be reused for the four trailing stopinliner(x.a) calls.
    movl    16(%rbp), %ebx
    movl    %ebx, %edi
    callq    _stopinliner
    ## Annotation: members at 24..72(%rbp) are each loaded straight into the
    ## argument register.
    movl    24(%rbp), %edi
    callq    _stopinliner
    movl    32(%rbp), %edi
    callq    _stopinliner
    movl    40(%rbp), %edi
    callq    _stopinliner
    movl    48(%rbp), %edi
    callq    _stopinliner
    movl    56(%rbp), %edi
    callq    _stopinliner
    movl    64(%rbp), %edi
    callq    _stopinliner
    movl    72(%rbp), %edi
    callq    _stopinliner
    ## Annotation: the repeated x.a calls reuse the copy kept in %ebx.
    movl    %ebx, %edi
    callq    _stopinliner
    movl    %ebx, %edi
    callq    _stopinliner
    movl    %ebx, %edi
    callq    _stopinliner
    ## Annotation: the argument for the last call is set up first, then the
    ## frame is torn down and the call is made as a jump.
    movl    %ebx, %edi
    addq    $8, %rsp
    popq    %rbx
    popq    %rbp
    jmp    _stopinliner            ## TAILCALL
    .cfi_endproc

    ## Annotation (not compiler output): code for c(). It reserves 64 bytes of
    ## stack, copies the eight quadwords of L_c.x into (%rsp)..56(%rsp) as the
    ## by-value argument, and then calls _b. This load/store sequence is the
    ## copy that the report says inlining could remove.
    .globl    _c
    .align    4, 0x90
_c:                                     ## @c
    .cfi_startproc
## BB#0:                                ## %entry
    pushq    %rbp
Ltmp9:
    .cfi_def_cfa_offset 16
Ltmp10:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp11:
    .cfi_def_cfa_register %rbp
    ## Annotation: 64 bytes = eight 8-byte members of the outgoing argument.
    subq    $64, %rsp
    ## Annotation: offsets 24..56 are loaded into five registers, then stored.
    movq    L_c.x+24(%rip), %rax
    movq    L_c.x+32(%rip), %rcx
    movq    L_c.x+40(%rip), %rdx
    movq    L_c.x+48(%rip), %rsi
    movq    L_c.x+56(%rip), %rdi
    movq    %rdi, 56(%rsp)
    movq    %rsi, 48(%rsp)
    movq    %rdx, 40(%rsp)
    movq    %rcx, 32(%rsp)
    movq    %rax, 24(%rsp)
    ## Annotation: offsets 16, 8 and 0 are copied one at a time through %rax.
    movq    L_c.x+16(%rip), %rax
    movq    %rax, 16(%rsp)
    movq    L_c.x+8(%rip), %rax
    movq    %rax, 8(%rsp)
    movq    L_c.x(%rip), %rax
    movq    %rax, (%rsp)
    callq    _b
    ## Annotation: release the argument area and restore the caller's frame.
    addq    $64, %rsp
    popq    %rbp
    ret
    .cfi_endproc

    ## Annotation (not compiler output): constant initialiser for the local x
    ## in c(). It holds eight 8-byte values, 1 through 8, in the __TEXT,__const
    ## section, and _c reads it RIP-relative as L_c.x+0 .. L_c.x+56.
    .section    __TEXT,__const
    .align    3                       ## @c.x
L_c.x:
    .quad    1                       ## 0x1
    .quad    2                       ## 0x2
    .quad    3                       ## 0x3
    .quad    4                       ## 0x4
    .quad    5                       ## 0x5
    .quad    6                       ## 0x6
    .quad    7                       ## 0x7
    .quad    8                       ## 0x8
--->8---

Note the useless copy in c, which could be elided by inlining.

One real-world example is c-ray, which relies heavily on a specific function
(ray_sphere) being inlined. This makes a performance difference of >20%.

$ wget http://www.phoronix-test-suite.com/benchmark-files/c-ray-1.1.tar.gz
$ tar xfz c-ray-1.1.tar.gz
$ cd c-ray-1.1
$ gcc-4.7 -O3 c-ray-mt.c -o c-ray-1
$ clang -O3 c-ray-mt.c -o c-ray-2
$ time ./c-ray-1 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm
./c-ray-1 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm  3.17s user 0.01s
system 383% cpu 0.829 total
$ time ./c-ray-2 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm
./c-ray-2 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm  4.25s user 0.01s
system 386% cpu 1.100 total

If I force inlining of ray_sphere, the time drops to 0.785 total.

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.