[LLVMdev] [Polly] Analysis of extra compile-time overhead for simple nested loops
Tobias Grosser
tobias at grosser.es
Thu Aug 15 21:44:02 PDT 2013
On 08/15/2013 03:32 AM, Star Tan wrote:
> Hi all,
Hi,
I tried to reproduce your findings, but could not do so.
> I have investigated the 6X extra compile-time overhead when Polly compiles the simple nestedloop benchmark in LLVM-testsuite. (http://188.40.87.11:8000/db_default/v4/nts/31?compare_to=28&baseline=28). Preliminary results show that such compile-time overhead is caused by the complicated polly-dependence analysis. However, the key seems to be the polly-prepare pass, which introduces a large number of store instructions and thus significantly complicates the polly-dependence pass.
>
>
> Let me show the results with a tiny code example:
>
>
> int main(int argc, char *argv[]) {
> int n = ((argc == 2) ? atoi(argv[1]) : 46);
> int a, b, x=0;
> for (a=0; a<n; a++)
> for (b=0; b<n; b++)
> x++;
> printf("%d\n", x);
> return(0);
> }
This one misses some includes. Also, it is more convenient if you attach
the actual C file you include.
> The basic LLVM IR code (produced by "clang -O1") is shown as:
> @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
> ; Function Attrs: nounwind uwtable
> define i32 @main(i32 %argc, i8** nocapture readonly %argv) {
> entry:
> %cmp = icmp eq i32 %argc, 2
> br i1 %cmp, label %cond.end, label %for.cond2.preheader.lr.ph
> cond.end:
> %arrayidx = getelementptr inbounds i8** %argv, i64 1
> %0 = load i8** %arrayidx, align 8
> %call = tail call i32 (i8*, ...)* bitcast (i32 (...)* @atoi to i32 (i8*, ...)*)(i8* %0) #3
> %cmp117 = icmp sgt i32 %call, 0
> br i1 %cmp117, label %for.cond2.preheader.lr.ph, label %for.end8
> for.cond2.preheader.lr.ph:
> %cond22 = phi i32 [ %call, %cond.end ], [ 46, %entry ]
> %cmp314 = icmp sgt i32 %cond22, 0
> br label %for.cond2.preheader
> for.cond2.preheader:
> %x.019 = phi i32 [ 0, %for.cond2.preheader.lr.ph ], [ %x.1.lcssa, %for.inc6 ]
> %a.018 = phi i32 [ 0, %for.cond2.preheader.lr.ph ], [ %inc7, %for.inc6 ]
> br i1 %cmp314, label %for.body4, label %for.inc6
> for.body4:
> %x.116 = phi i32 [ %inc, %for.body4 ], [ %x.019, %for.cond2.preheader ]
> %b.015 = phi i32 [ %inc5, %for.body4 ], [ 0, %for.cond2.preheader ]
> %inc = add nsw i32 %x.116, 1
> %inc5 = add nsw i32 %b.015, 1
> %cmp3 = icmp slt i32 %inc5, %cond22
> br i1 %cmp3, label %for.body4, label %for.inc6
> for.inc6:
> %x.1.lcssa = phi i32 [ %x.019, %for.cond2.preheader ], [ %inc, %for.body4 ]
> %inc7 = add nsw i32 %a.018, 1
> %cmp1 = icmp slt i32 %inc7, %cond22
> br i1 %cmp1, label %for.cond2.preheader, label %for.end8
> for.end8:
> %x.0.lcssa = phi i32 [ 0, %cond.end ], [ %x.1.lcssa, %for.inc6 ]
> %call9 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %x.0.lcssa) #3
> ret i32 0
> }
> declare i32 @atoi(...)
> declare i32 @printf(i8* nocapture readonly, ...)
This test case misses the target data. If you would attach the original
file, it would be easier to reproduce.
> Such code is very simple and there is no memory instruction at all, so the polly-dependence pass runs very fast on this code. Unfortunately, when we run "opt -load LLVMPolly.so" for this basic LLVM IR code, the polly-prepare pass would introduce a large number of store instructions like this:
Which passes and commands have you actually run? The command "opt -load
LLVMPolly.so" does not run the -polly-prepare pass.
Sorry for being picky here. Unfortunately, there are too many ways you
could have run such passes, that I am afraid I would not guess the right
one. And if we are running different experiments, it is difficult to
discuss them.
The way I propose to run the passes is as follows:
$ clang -O0 -S -emit-llvm test.c -o out.ll
# Get the polly passes that are run
$ polly-opt test.ll -O3 -polly -debug-passes=Structure
Pass Arguments: -datalayout -notti -basictti -x86tti -targetlibinfo
-no-aa -tbaa -basicaa -preverify -domtree -verify -mem2reg -instcombine
-simplifycfg -tailcallelim -simplifycfg -reassociate -domtree -loops
-loop-simplify -lcssa -loop-rotate -instcombine -scalar-evolution
-loop-simplify -lcssa -iv-users -polly-indvars -polly-prepare
-postdomtree -domfrontier -regions -scalar-evolution -polly-detect
-polly-independent -polly-analyze-ir -polly-scops -polly-dependences
-polly-opt-isl -polly-cloog -polly-codegen -simplifycfg -domtree -sroa
-early-cse -lower-expect
[...]
# Run everything up to the offending pass
$ polly-opt test.ll -targetlibinfo -no-aa -tbaa -basicaa -preverify
-domtree -verify -mem2reg -instcombine -simplifycfg -tailcallelim
-simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa
-loop-rotate -instcombine -scalar-evolution -loop-simplify -lcssa
-iv-users -polly-indvars -o out.preopt.ll -S
# Show scops with and without preopt
~$ polly-opt -basicaa -view-scops out.preopt.ll -disable-output
Writing '/tmp/scops-f3a2fa.dot'... done.
Running 'xdot.py' program... done.
~$ polly-opt -basicaa -polly-prepare -view-scops out.preopt.ll
-disable-output
Writing '/tmp/scops-37f35f.dot'... done.
Running 'xdot.py' program... done.
# Print the scop info:
$ polly-opt -basicaa -polly-prepare -polly-scops out.preopt.ll -analyze
# Time with and without preoptimization
~$ time polly-opt -basicaa -polly-prepare -polly-dependences
out.preopt.ll -disable-output
real 0m0.040s
user 0m0.032s
sys 0m0.004s
~$ time polly-opt -basicaa -polly-dependences out.preopt.ll -disable-output
real 0m0.011s
user 0m0.004s
sys 0m0.004s
Looking at the timings, this does in fact not look too bad. Run within -O3
~$ time polly-opt -basicaa -O3 out.ll -disable-output
real 0m0.015s
user 0m0.004s
sys 0m0.008s
~$ time polly-opt -basicaa -O3 out.ll -disable-output -polly
real 0m0.029s
user 0m0.024s
sys 0m0.004s
Within -O3 the overhead is still far away from the 60x slowdown we have
seen on the other test case. A 2x slowdown for a kernel that we fully
analyse and code generate is not optimal, but probably not an issue. The
question is if the increasing depth of the loop nest can yield this 60x
slowdown.
[..]
> These store instructions significantly complicate the "polly-dependence" pass, and thus leading to high compile-time overhead.
What do you mean by "complicated the 'polly-dependence' pass"? Without
-polly-prepare the scop that is detected is a lot smaller. Hence, the
two scops are not really comparable.
The point of the -polly-prepare pass is to transform some scalar
dependences into memory dependences. Introducing explicit load/store
instructions simplifies the later passes as scalar dependences do not
need to be handled specially. However, introducing store instructions does (or
should) not make the actual dependence analysis problem more
complicated. If Polly would be extended to detect SCoPs with scalar
dependences, those scalar dependences should be the very same ones that
are now created for the memory dependences introduced by the
-polly-prepare pass.
> I have noticed that such memory instructions are finally simplified to scalar operations by the SROA pass, so one possible way to reduce such compile-time overhead is to move the SROA pass ahead of polly-dependence analysis.
>
>
> Can anyone give me some hints that why the polly-prepare pass introduces such memory instructions? Is it possible to move the SROA pass ahead of polly-dependence analysis?
Moving the SROA pass is not the solution. It would just prevent Polly
from detecting some SCoPs.
I think there are two angles we can have a look at:
1) Does -polly-prepare introduce unneeded dependences?
This means, could we model the scalar dependences more efficiently than
how they are modeled by the array accesses introduced by -polly-prepare.
2) What triggers the 60x slowdown in dependence analysis
Is this due to the increasing loop nest depth? Or the increasing number
of allocations introduced by -polly-prepare?
An interesting experiment would be to see if the dependence analysis
runs faster on a loop that uses a single element array to implement the
reduction:
for (i
for (j
...
X[0] = ...
Meaning one alloc instruction and a single load and store in the
innermost loop, without any non-induction variable PHI nodes.
When implementing this in C, LLVM may introduce again PHI nodes. So it
may possibly be necessary to write this directly in LLVM-IR.
This became a long mail. Hope it gives you some ideas how to proceed.
Cheers,
Tobi
P.S: Sebastian, also thank you for your comments!
-------------- next part --------------
A non-text attachment was scrubbed...
Name: test.c
Type: text/x-csrc
Size: 195 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20130815/85bcf4bc/attachment.c>
-------------- next part --------------
; ModuleID = '/tmp/test.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@b = common global [2048 x i32] zeroinitializer, align 16
@a = common global [2048 x i32] zeroinitializer, align 16
; Function Attrs: noinline nounwind uwtable
define void @example7(i32 %x) #0 {
entry:
br label %entry.split
entry.split: ; preds = %entry
%0 = zext i32 %x to i64
br label %for.body
for.body: ; preds = %entry.split, %for.body
%indvar = phi i64 [ 0, %entry.split ], [ %indvar.next, %for.body ]
%1 = add i64 %0, %indvar
%add = trunc i64 %1 to i32
%arrayidx2 = getelementptr [2048 x i32]* @a, i64 0, i64 %indvar
%idxprom = sext i32 %add to i64
%arrayidx = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %1
%2 = load i32* %arrayidx, align 4
store i32 %2, i32* %arrayidx2, align 4
%indvar.next = add i64 %indvar, 1
%exitcond = icmp ne i64 %indvar.next, 1024
br i1 %exitcond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
attributes #0 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-------------- next part --------------
; ModuleID = 'test.ll'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** %argv) #0 {
%1 = icmp eq i32 %argc, 2
br i1 %1, label %2, label %6
; <label>:2 ; preds = %0
%3 = getelementptr inbounds i8** %argv, i64 1
%4 = load i8** %3, align 8
%5 = tail call i32 (i8*, ...)* bitcast (i32 (...)* @atoi to i32 (i8*, ...)*)(i8* %4) #2
br label %6
; <label>:6 ; preds = %0, %2
%7 = phi i32 [ %5, %2 ], [ 46, %0 ]
%8 = icmp sgt i32 %7, 0
br i1 %8, label %.preheader.lr.ph, label %15
.preheader.lr.ph: ; preds = %6
br label %.preheader
.preheader: ; preds = %.preheader.lr.ph, %13
%x.04 = phi i32 [ 0, %.preheader.lr.ph ], [ %x.1.lcssa, %13 ]
%a.03 = phi i32 [ 0, %.preheader.lr.ph ], [ %14, %13 ]
%9 = icmp sgt i32 %7, 0
br i1 %9, label %.lr.ph, label %13
.lr.ph: ; preds = %.preheader
br label %10
; <label>:10 ; preds = %.lr.ph, %10
%b.01 = phi i32 [ 0, %.lr.ph ], [ %11, %10 ]
%11 = add nsw i32 %b.01, 1
%exitcond = icmp ne i32 %11, %7
br i1 %exitcond, label %10, label %._crit_edge
._crit_edge: ; preds = %10
%12 = add i32 %7, %x.04
br label %13
; <label>:13 ; preds = %._crit_edge, %.preheader
%x.1.lcssa = phi i32 [ %12, %._crit_edge ], [ %x.04, %.preheader ]
%14 = add nsw i32 %a.03, 1
%exitcond7 = icmp ne i32 %14, %7
br i1 %exitcond7, label %.preheader, label %._crit_edge5
._crit_edge5: ; preds = %13
%x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %13 ]
br label %15
; <label>:15 ; preds = %._crit_edge5, %6
%x.0.lcssa = phi i32 [ %x.1.lcssa.lcssa, %._crit_edge5 ], [ 0, %6 ]
%16 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %x.0.lcssa) #2
ret i32 0
}
declare i32 @atoi(...) #1
declare i32 @printf(i8*, ...) #1
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
More information about the llvm-dev
mailing list