[polly] r229423 - Update to isl 99d53692ba (Performance changes)

Tue Feb 17 02:00:46 PST 2015

On 16.02.2015 20:33, Tobias Grosser wrote:
> Author: grosser
> Date: Mon Feb 16 13:33:40 2015
> New Revision: 229423
>
> URL: http://llvm.org/viewvc/llvm-project?rev=229423&view=rev
> Log:
> Update to isl 99d53692ba
>
> This commit imports the latest isl version into lib/External/isl. The changes
> relevant for Polly are:
>
>    1) Schedule trees [1] have been introduced as a more structured way to
>       describe schedules. Polly does not yet use them, but we may switch to them
>       in the near future.
>    2) Another set of coalescing changes [2] simplifies some data dependences and
>       removes a couple of code generation artifacts.
>
>       We now understand that the following sets can be merged:
>
>       { Stmt_S1[i0, i1] -> Stmt_S2[i0 + i1] :
>            i0 >= 0 and i1 <= 1023 - i0 and i1 >= 1;
>         Stmt_S1[i0, 0] -> Stmt_S2[i0] : i0 <= 1023 and i0 >= 1}
>
>       into:
>
>       { Stmt_S1[i0, i1] -> Stmt_S2[i0 + i1] : i1 <= 1023 - i0 and i1 >= 0 and
>                                               i1 >= 1 - i0 and i0 >= 0 }
>
>       Changes of this kind reduce unnecessary specialization during code
>       generation.
>
>       -  for (int c3 = 0; c3 <= 1023; c3 += 1) {
>       -    if (c3 % 2 == 0) {
>       -      Stmt_for_body3(c1, c3);
>       -    } else
>       -      Stmt_for_body3(c1, c3);
>       -  }
>       +  for (int c3 = 0; c3 <= 1023; c3 += 1)
>       +    Stmt_for_body3(c1, c3);

This commit improved compile time:

jacobi-2d-imper 	-47.40% 	2.9962 	1.5761
fdtd-2d 		-44.59% 	7.5665 	4.1923
3mm		 	-19.97% 	2.4042 	1.9241
whetstone 		-17.86% 	0.2240 	0.1840
jacobi-1d-imper 	-16.33% 	0.5880 	0.4920

We also have one larger compile time increase:

2mm 			 58.22% 	1.2161 	1.9241

The compile time increase is due to a reduction in dependence analysis
time for 2mm, which brings it below our compute-out limit, so the analysis
now completes and enables additional optimizations that reduce execution time:

2mm			-65.08% 	18.8892 	6.5964

I looked into the reasons for these compile time reductions. For
jacobi-2d-imper and fdtd-2d the code we generate is largely simplified:

jacobi-2d BEFORE:

     if (n >= 3)
       for (int c0 = 0; c0 < tsteps; c0 += 32)
         for (int c1 = 2 * c0; c1 <= min(2 * tsteps + n - 4, n + 2 * c0 
+ 60); c1 += 32) {
           if (2 * tsteps + n >= c1 + 5 && n + 2 * c0 + 59 >= c1) {
             for (int c2 = c1; c2 <= min(min(2 * tsteps + 2 * n - 8, 2 * 
n + 2 * c0 + 56), n + c1 + 28); c2 += 32) {
               if (2 * tsteps + n >= c1 + 6 && c1 + 2 >= n + 2 * c0 && n 
+ 2 * c0 + 58 >= c1 && n % 2 == 0)
                 for (int c5 = c2; c5 <= min(n + c1 - 3, c2 + 31); c5 += 1)
                   Stmt_for_body47(((-n + c1) / 2) + 1, n - 3, -c1 + c5);
               for (int c3 = max(max(c0, (c2 / 2) - n + 3), -n + 
floord(n + c1, 2) + 2); c3 <= min(min(tsteps - 1, c0 + 31), (c1 / 2) + 
15); c3 += 1) {
                 for (int c4 = max(max(c1, -n + c2 + 3), 2 * c3); c4 <= 
min(c1 + 31, n + 2 * c3 - 3); c4 += 1)
                   for (int c5 = max(c2, c4); c5 <= min(c2 + 31, n + c4 
- 3); c5 += 1) {
                     Stmt_for_body7(c3, -2 * c3 + c4, -c4 + c5);
                     if (c4 >= 2 * c3 + 1)
                       Stmt_for_body47(c3, -2 * c3 + c4 - 1, -c4 + c5);
                   }
                 if (c1 + 33 >= n + 2 * c3)
                   for (int c5 = max(c2, n + 2 * c3 - 2); c5 <= min(c2 + 
31, 2 * n + 2 * c3 - 5); c5 += 1)
                     Stmt_for_body47(c3, n - 3, -n - 2 * c3 + c5 + 2);
               }
             }
           } else if (c1 == n + 2 * c0 + 60 && (n + 2 * c0 + 60) % 32 == 
0) {
             for (int c2 = n + 2 * c0 + 60; c2 <= 2 * n + 2 * c0 + 57; 
c2 += 32)
               for (int c5 = c2; c5 <= min(2 * n + 2 * c0 + 57, c2 + 
31); c5 += 1)
                 Stmt_for_body47(c0 + 31, n - 3, -n - 2 * c0 + c5 - 60);
           } else
             for (int c2 = 2 * tsteps + n - 4; c2 < 2 * tsteps + 2 * n - 
6; c2 += 32)
               for (int c5 = c2; c5 <= min(2 * tsteps + 2 * n - 7, c2 + 
31); c5 += 1)
                 Stmt_for_body47(tsteps - 1, n - 3, -2 * tsteps - n + c5 
+ 4);
         }

jacobi-2d AFTER:

     for (int c0 = 0; c0 < tsteps; c0 += 32)
       for (int c1 = 2 * c0; c1 <= min(2 * tsteps + n - 4, n + 2 * c0 + 
60); c1 += 32)
         for (int c2 = c1; c2 <= min(min(2 * tsteps + 2 * n - 7, 2 * n + 
2 * c0 + 57), n + c1 + 28); c2 += 32)
           for (int c3 = max(max(c0, (c2 / 2) - n + 3), -n + floord(n + 
c1 + 1, 2) + 1); c3 <= min(min(tsteps - 1, c0 + 31), (c1 / 2) + 15); c3 
+= 1)
             for (int c4 = max(max(c1, -n + c2 + 3), 2 * c3); c4 <= 
min(c1 + 31, n + 2 * c3 - 2); c4 += 1)
               for (int c5 = max(c2, c4); c5 <= min(c2 + 31, n + c4 - 
3); c5 += 1) {
                 if (n + 2 * c3 >= c4 + 3)
                   Stmt_for_body7(c3, -2 * c3 + c4, -c4 + c5);
                 if (c4 >= 2 * c3 + 1)
                   Stmt_for_body47(c3, -2 * c3 + c4 - 1, -c4 + c5);
               }

For 3mm the reduced compile time is just due to isl computing faster. I
did not investigate the other results.

For details see:

http://llvm.org/perf/db_default/v4/nts/21258?num_comparison_runs=0&test_filter=&test_min_value_filter=&aggregation_fn=median&MW_confidence_lv=0.05&compare_to=21237&submit=Update

Cheers,
Tobias