[LLVMdev] Improving performance with optimization passes
Jon Harrop
jon at ffconsultancy.com
Thu Feb 19 11:44:29 PST 2009
On Thursday 19 February 2009 19:00:14 Jon Harrop wrote:
> I'm toying with benchmarks on my HLVM and am unable to get any performance
> improvement from optimization passes...
I just disassembled some of the IR before and after optimization. This example
function squares a complex number:
let zsqr(r, i) = (r*r - i*i, 2*r*i)
My compiler is generating:
; zsqr as emitted by the front end, before any optimization passes.
; %0 (first unnamed argument) is the pointer the result pair is stored through;
; %1 (second unnamed argument) is the input pair (r, i) passed by value.
; Each of the six operand reads below copies the whole of %1 into a fresh
; stack slot and loads a single element back out of it.
; NOTE: the bare "[#uses=N]" lines appear to be the tails of the preceding
; ';' comments, wrapped by the mail archive; they are kept exactly as posted.
define fastcc i32 @zsqr({ double, double }*, { double, double }) {
entry:
; r*r, first operand: spill %1, load element 0.
%2 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%3 = getelementptr { double, double }* %2, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %3
%4 = getelementptr { double, double }* %2, i32 0, i32 0 ; <double*>
[#uses=1]
%5 = load double* %4 ; <double> [#uses=1]
; r*r, second operand: a second spill of %1, element 0 again.
%6 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%7 = getelementptr { double, double }* %6, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %7
%8 = getelementptr { double, double }* %6, i32 0, i32 0 ; <double*>
[#uses=1]
%9 = load double* %8 ; <double> [#uses=1]
%10 = mul double %5, %9 ; <double> [#uses=1]
; i*i, first operand: spill %1, load element 1.
%11 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%12 = getelementptr { double, double }* %11, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %12
%13 = getelementptr { double, double }* %11, i32 0, i32 1 ; <double*>
[#uses=1]
%14 = load double* %13 ; <double> [#uses=1]
; i*i, second operand, followed by r*r - i*i.
%15 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%16 = getelementptr { double, double }* %15, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %16
%17 = getelementptr { double, double }* %15, i32 0, i32 1 ; <double*>
[#uses=1]
%18 = load double* %17 ; <double> [#uses=1]
%19 = mul double %14, %18 ; <double> [#uses=1]
%20 = sub double %10, %19 ; <double> [#uses=1]
; 2*r: spill %1, load element 0, multiply by the constant 2.0.
%21 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%22 = getelementptr { double, double }* %21, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %22
%23 = getelementptr { double, double }* %21, i32 0, i32 0 ; <double*>
[#uses=1]
%24 = load double* %23 ; <double> [#uses=1]
%25 = mul double 2.000000e+00, %24 ; <double> [#uses=1]
; (2*r)*i: spill %1, load element 1.
%26 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%27 = getelementptr { double, double }* %26, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %27
%28 = getelementptr { double, double }* %26, i32 0, i32 1 ; <double*>
[#uses=1]
%29 = load double* %28 ; <double> [#uses=1]
%30 = mul double %25, %29 ; <double> [#uses=1]
; Assemble the result pair in a seventh stack slot, reload it as a whole
; struct, and store it through the result pointer %0.
%31 = alloca { double, double } ; <{ double, double }*> [#uses=3]
%32 = getelementptr { double, double }* %31, i32 0, i32 0 ; <double*>
[#uses=1]
store double %20, double* %32
%33 = getelementptr { double, double }* %31, i32 0, i32 1 ; <double*>
[#uses=1]
store double %30, double* %33
%34 = getelementptr { double, double }* %31, i32 0 ; <{ double, double }*>
[#uses=1]
%35 = load { double, double }* %34 ; <{ double, double }> [#uses=1]
%36 = getelementptr { double, double }* %0, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %35, { double, double }* %36
ret i32 0
}
But those LLVM optimization passes only reduce it to:
; zsqr after the optimization passes were run.
; Compared with the unoptimized listing, the single-index getelementptrs
; (", i32 0" only) have been folded into their users, "align 8" has been added
; to the loads and to every store except the final one through %0, and the
; constant operand of the 2.0 multiply has moved to the right-hand side.
; The stack traffic is unchanged: %1 is still copied to six separate allocas
; and the result is still built in a seventh.
; NOTE: the bare "[#uses=N]" lines appear to be the tails of the preceding
; ';' comments, wrapped by the mail archive; they are kept exactly as posted.
define fastcc i32 @zsqr({ double, double }*, { double, double }) {
entry:
; r*r: two spills of %1, element 0 loaded from each.
%2 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %2, align 8
%3 = getelementptr { double, double }* %2, i32 0, i32 0 ; <double*>
[#uses=1]
%4 = load double* %3, align 8 ; <double> [#uses=1]
%5 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %5, align 8
%6 = getelementptr { double, double }* %5, i32 0, i32 0 ; <double*>
[#uses=1]
%7 = load double* %6, align 8 ; <double> [#uses=1]
%8 = mul double %4, %7 ; <double> [#uses=1]
; i*i: two more spills of %1, element 1 loaded from each; then r*r - i*i.
%9 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %9, align 8
%10 = getelementptr { double, double }* %9, i32 0, i32 1 ; <double*>
[#uses=1]
%11 = load double* %10, align 8 ; <double> [#uses=1]
%12 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %12, align 8
%13 = getelementptr { double, double }* %12, i32 0, i32 1 ; <double*>
[#uses=1]
%14 = load double* %13, align 8 ; <double> [#uses=1]
%15 = mul double %11, %14 ; <double> [#uses=1]
%16 = sub double %8, %15 ; <double> [#uses=1]
; 2*r: spill %1, load element 0, multiply by 2.0.
%17 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %17, align 8
%18 = getelementptr { double, double }* %17, i32 0, i32 0 ; <double*>
[#uses=1]
%19 = load double* %18, align 8 ; <double> [#uses=1]
%20 = mul double %19, 2.000000e+00 ; <double> [#uses=1]
; (2*r)*i: spill %1, load element 1.
%21 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %21, align 8
%22 = getelementptr { double, double }* %21, i32 0, i32 1 ; <double*>
[#uses=1]
%23 = load double* %22, align 8 ; <double> [#uses=1]
%24 = mul double %20, %23 ; <double> [#uses=1]
; Assemble the result pair on the stack, reload it whole, store through %0.
%25 = alloca { double, double } ; <{ double, double }*> [#uses=3]
%26 = getelementptr { double, double }* %25, i32 0, i32 0 ; <double*>
[#uses=1]
store double %16, double* %26, align 8
%27 = getelementptr { double, double }* %25, i32 0, i32 1 ; <double*>
[#uses=1]
store double %24, double* %27, align 8
%28 = load { double, double }* %25, align 8 ; <{ double, double }> [#uses=1]
store { double, double } %28, { double, double }* %0
ret i32 0
}
So the optimization passes are at least doing something but they are a long
way from generating optimal code. Does LLVM have any optimization passes that
would promote these structs out of the stack and replace the loads with
extractvalue instructions?
The ideal result is probably:
; Hand-written target for zsqr(r, i) = (r*r - i*i, 2*r*i): no stack slots,
; the fields of the by-value argument are read with extractvalue and each
; result element is stored directly through the result pointer.
; %0 is the result pointer and %1 the input pair (the two unnamed arguments),
; so the first unnamed temporary in the body must be %2; values are numbered
; sequentially from there, and stores do not consume a number.
define fastcc i32 @zsqr({ double, double }*, { double, double }) {
entry:
; r and i, taken straight out of the argument.
%2 = extractvalue { double, double } %1, 0
%3 = extractvalue { double, double } %1, 1
; Element 0 of the result: r*r - i*i.
%4 = mul double %2, %2
%5 = mul double %3, %3
%6 = sub double %4, %5
%7 = getelementptr { double, double }* %0, i32 0, i32 0
store double %6, double* %7, align 8
; Element 1 of the result: (r*2.0)*i.
%8 = mul double %2, 2.0
%9 = mul double %8, %3
%10 = getelementptr { double, double }* %0, i32 0, i32 1
store double %9, double* %10, align 8
ret i32 0
}
--
Dr Jon Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?e
More information about the llvm-dev
mailing list