[LLVMbugs] [Bug 13095] New: Give an inline cost bonus to byval functions
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Tue Jun 12 13:28:15 PDT 2012
http://llvm.org/bugs/show_bug.cgi?id=13095
Bug #: 13095
Summary: Give an inline cost bonus to byval functions
Product: libraries
Version: trunk
Platform: PC
OS/Version: All
Status: NEW
Severity: enhancement
Priority: P
Component: Interprocedural Optimizations
AssignedTo: unassignedbugs at nondot.org
ReportedBy: benny.kra at gmail.com
CC: chandlerc at gmail.com, llvmbugs at cs.uiuc.edu
Classification: Unclassified
A byval argument can be interpreted as a bundle of arguments. The inline cost
analysis gives a bonus for every argument, but it doesn't give a boost to byval
arguments. Take this simple example (it's a bit braindead so it can tickle the
inliner; real-world examples are more complicated):
----8<----
struct vec6 {
long a, b, c, d, e, f, g, h;
};
void stopinliner(int);
/*__attribute__((always_inline))*/ void b(struct vec6 x) {
stopinliner(x.a);
stopinliner(x.b);
stopinliner(x.c);
stopinliner(x.d);
stopinliner(x.e);
stopinliner(x.f);
stopinliner(x.g);
stopinliner(x.h);
stopinliner(x.a);
stopinliner(x.a);
stopinliner(x.a);
stopinliner(x.a);
}
void c(void) {
struct vec6 x = { 1, 2, 3, 4, 5, 6, 7, 8 };
b(x);
}
--->8---
clang -O3 generates:
---8<---
_b: ## @b
.cfi_startproc
## BB#0: ## %entry
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
pushq %rbx
pushq %rax
Ltmp6:
.cfi_offset %rbx, -24
movl 16(%rbp), %ebx
movl %ebx, %edi
callq _stopinliner
movl 24(%rbp), %edi
callq _stopinliner
movl 32(%rbp), %edi
callq _stopinliner
movl 40(%rbp), %edi
callq _stopinliner
movl 48(%rbp), %edi
callq _stopinliner
movl 56(%rbp), %edi
callq _stopinliner
movl 64(%rbp), %edi
callq _stopinliner
movl 72(%rbp), %edi
callq _stopinliner
movl %ebx, %edi
callq _stopinliner
movl %ebx, %edi
callq _stopinliner
movl %ebx, %edi
callq _stopinliner
movl %ebx, %edi
addq $8, %rsp
popq %rbx
popq %rbp
jmp _stopinliner ## TAILCALL
.cfi_endproc
.globl _c
.align 4, 0x90
_c: ## @c
.cfi_startproc
## BB#0: ## %entry
pushq %rbp
Ltmp9:
.cfi_def_cfa_offset 16
Ltmp10:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp11:
.cfi_def_cfa_register %rbp
subq $64, %rsp
movq L_c.x+24(%rip), %rax
movq L_c.x+32(%rip), %rcx
movq L_c.x+40(%rip), %rdx
movq L_c.x+48(%rip), %rsi
movq L_c.x+56(%rip), %rdi
movq %rdi, 56(%rsp)
movq %rsi, 48(%rsp)
movq %rdx, 40(%rsp)
movq %rcx, 32(%rsp)
movq %rax, 24(%rsp)
movq L_c.x+16(%rip), %rax
movq %rax, 16(%rsp)
movq L_c.x+8(%rip), %rax
movq %rax, 8(%rsp)
movq L_c.x(%rip), %rax
movq %rax, (%rsp)
callq _b
addq $64, %rsp
popq %rbp
ret
.cfi_endproc
.section __TEXT,__const
.align 3 ## @c.x
L_c.x:
.quad 1 ## 0x1
.quad 2 ## 0x2
.quad 3 ## 0x3
.quad 4 ## 0x4
.quad 5 ## 0x5
.quad 6 ## 0x6
.quad 7 ## 0x7
.quad 8 ## 0x8
--->8---
Note the useless copy in c, which could be elided by inlining b into c.
One real-world example is c-ray, which relies heavily on a specific function
(ray_sphere) being inlined. This makes a performance difference of >20%.
$ wget http://www.phoronix-test-suite.com/benchmark-files/c-ray-1.1.tar.gz
$ tar xfz c-ray-1.1.tar.gz
$ cd c-ray-1.1
$ gcc-4.7 -O3 c-ray-mt.c -o c-ray-1
$ clang -O3 c-ray-mt.c -o c-ray-2
$ time ./c-ray-1 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm
./c-ray-1 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm 3.17s user 0.01s
system 383% cpu 0.829 total
$ time ./c-ray-2 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm
./c-ray-2 -t 32 -s 160x120 -r 8 -i sphfract -o output.ppm 4.25s user 0.01s
system 386% cpu 1.100 total
If I force inlining of ray_sphere, the time drops to 0.785 total.
--
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.
More information about the llvm-bugs
mailing list