[LLVMdev] [DragonEgg] [Polly] Should we expect DragonEgg to produce identical LLVM IR for identical GIMPLE?

Mon Dec 31 10:06:28 PST 2012

Forgot to attach original sources (Fortran).

2012/12/31 Dmitry Mikushin <dmitry at kernelgen.org>

> Dear all,
>
> In our compiler we use a modified version LLVM Polly, which is very
> sensitive to proper code generation. Among the number of limitations, the
> loop region (enclosed by phi node on induction variable and branch) is
> required to be free of additional memory-dependent branches. In other
> words, there must be no conditional "br" instructions below phi nodes. The
> problem we are facing is that from *identical* GIMPLE for 3d loop used in
> different contexts DragonEgg may generate LLVM IR either conforming the
> described limitation, or violating it.
>
> Let's consider two examples. In first one some simple 3D loop is used
> directly in main program. In second - the same 3D loop is extracted into
> separate subroutine called from main. Attached source code and listings for
> GIMPLE and LLVM IR show that although GIMPLE codes are similar, by some
> reason in first case branching goes under phi nodes, making Polly to fail
> with parallelizing the resion. In second case everything is fine.
>
> I have not looked into DragonEgg internals enough yet, but before I'll do,
> the question is: should we expect DragonEgg to produce identical LLVM IRs
> for identical GIMPLEs? The case shown here is really really important. The
> success of parallelization utilities that are currently in a quite good
> shape (thanks to their developers!) nowadays heavily relies on the code
> quality: if IR is too rough, we can't do much about it :(
>
> Many thanks and Happy New Year!
> - Dima.
>
>
> 1) Bad LLVM IR:
> ===============
>
> marcusmae at M17xR4:~/forge/kernelgen/tests/behavior/demo_f$
> KERNELGEN_FALLBACK=1 kernelgen-gfortran -fplugin=dragonegg.so
> -fplugin-arg-dragonegg-emit-ir -fplugin-arg-dragonegg-llvm-ir-optimize=0 -S
> -c demo_f.f90 -o - | opt -O3 -S -o -
>
> "161.i":                                          ; preds = %"160.i",
> %"159.i"
>   call void bitcast (void (...)* @_gfortran_cpu_time_4 to void
> (float*)*)(float* %start.i) nounwind
>   %204 = load i32* %ns.i, align 4
>   %205 = icmp sgt i32 %204, 0
>   br i1 %205, label %"162.preheader.i", label %"170.i"
>
> "162.preheader.i":                                ; preds = %"161.i"
>   %206 = bitcast i8* %x.0.0.i to float*
>   %207 = add i64 %y.3.2.0.0.i, %y.3.1.0.0.i
>   %208 = bitcast i8* %142 to float*
>   %.pre.i = load i32* %ny.i, align 4
>   %209 = icmp sgt i32 %.pre.i, 0
>   br label %"162.i"
>
> "162.i":                                          ; preds = %"168.i",
> %"162.preheader.i"
>   %210 = phi i32 [ %240, %"168.i" ], [ 1, %"162.preheader.i" ]
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
>   br i1 %209, label %"163.preheader.i", label %"168.i"
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
>
> "163.preheader.i":                                ; preds = %"162.i"
>   %211 = sext i32 %210 to i64
>   %212 = mul i64 %211, %86
>   %213 = mul i64 %211, %207
>   %.pre118.i = load i32* %nx.i, align 4
>   %214 = icmp sgt i32 %.pre118.i, 0
>   %215 = add i64 %212, %98
>   %216 = add i64 %213, %y.1.0.i
>   br label %"163.i"
>
> "163.i":                                          ; preds = %"166.i",
> %"163.preheader.i"
>   %217 = phi i32 [ %238, %"166.i" ], [ 1, %"163.preheader.i" ]
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
>   br i1 %214, label %"164.preheader.i", label %"166.i"
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
>
> "164.preheader.i":                                ; preds = %"163.i"
>   %218 = sext i32 %217 to i64
>   %219 = mul i64 %218, %72
>   %220 = add i64 %215, %219
>   br label %"164.i"
>
> "164.i":                                          ; preds = %"164.i",
> %"164.preheader.i"
>   %221 = phi i32 [ %236, %"164.i" ], [ 1, %"164.preheader.i" ]
>   %222 = sext i32 %221 to i64
>   %223 = add i64 %220, %222
>   %224 = getelementptr float* %206, i64 %223
>   %225 = load float* %224, align 4
>   %226 = call float @sinf(float %225) nounwind readnone
>   %227 = call float @asinf(float %226) nounwind readnone
>   %228 = add i64 %216, %222
>   %229 = getelementptr [0 x float]* %173, i64 0, i64 %228
>   %230 = load float* %229, align 4
>   %231 = call float @cosf(float %230) nounwind readnone
>   %232 = call float @acosf(float %231) nounwind readnone
>   %233 = fadd float %227, %232
>   %234 = getelementptr float* %208, i64 %223
>   store float %233, float* %234, align 4
>   %235 = icmp eq i32 %221, %.pre118.i
>   %236 = add i32 %221, 1
>   br i1 %235, label %"166.i", label %"164.i"
>
> "166.i":                                          ; preds = %"164.i",
> %"163.i"
>   %237 = icmp eq i32 %217, %.pre.i
>   %238 = add i32 %217, 1
>   br i1 %237, label %"168.i", label %"163.i"
>
> "168.i":                                          ; preds = %"166.i",
> %"162.i"
>   %239 = icmp eq i32 %210, %204
>   %240 = add i32 %210, 1
>   br i1 %239, label %"170.i", label %"162.i"
>
> "170.i":                                          ; preds = %"168.i",
> %"161.i"
>   call void bitcast (void (...)* @_gfortran_cpu_time_4 to void
> (float*)*)(float* %finish.i) nounwind
>   br i1 %37, label %"172.i", label %"171.i"
>
> GIMPLE:
> =======
>
>         _gfortran_cpu_time_4 (&start);
>         {
>           integer(kind=4) D.1699;
>
>           D.1699 = ns;
>           k = 1;
>           if (k <= D.1699) goto <D.2262>; else goto <D.2263>;
>           <D.2262>:
>           <D.2264>:
>           {
>             logical(kind=4) D.1710;
>
>             {
>               integer(kind=4) D.1702;
>
>               D.1702 = ny;
>               j = 1;
>               if (j <= D.1702) goto <D.2265>; else goto <D.2266>;
>               <D.2265>:
>               <D.2267>:
>               {
>                 logical(kind=4) D.1709;
>
>                 {
>                   integer(kind=4) D.1705;
>
>                   D.1705 = nx;
>                   i = 1;
>                   if (i <= D.1705) goto <D.2268>; else goto <D.2269>;
>                   <D.2268>:
>                   <D.2270>:
>                   {
>                     logical(kind=4) D.1708;
>
>                     D.2271 = xy.data;
>                     D.2272 = (integer(kind=8)) i;
>                     D.2273 = (integer(kind=8)) k;
>                     D.2274 = xy.dim[2].stride;
>                     D.2275 = D.2273 * D.2274;
>                     D.2276 = (integer(kind=8)) j;
>                     D.2277 = xy.dim[1].stride;
>                     D.2278 = D.2276 * D.2277;
>                     D.2279 = D.2275 + D.2278;
>                     D.2280 = D.2272 + D.2279;
>                     D.2281 = xy.offset;
>                     D.2282 = D.2280 + D.2281;
>                     D.2283 = x.data;
>                     D.2284 = (integer(kind=8)) i;
>                     D.2285 = (integer(kind=8)) k;
>                     D.2286 = x.dim[2].stride;
>                     D.2287 = D.2285 * D.2286;
>                     D.2288 = (integer(kind=8)) j;
>                     D.2289 = x.dim[1].stride;
>                     D.2290 = D.2288 * D.2289;
>                     D.2291 = D.2287 + D.2290;
>                     D.2292 = D.2284 + D.2291;
>                     D.2293 = x.offset;
>                     D.2294 = D.2292 + D.2293;
>                     D.2295 = MEM[(real(kind=4)[0:] *)D.2283][D.2294];
>                     D.2296 = __builtin_sinf (D.2295);
>                     D.2297 = __builtin_asinf (D.2296);
>                     D.2298 = y.data;
>                     D.2299 = (integer(kind=8)) i;
>                     D.2300 = y.dim[2].stride;
>                     D.2301 = y.dim[1].stride;
>                     D.2302 = D.2300 + D.2301;
>                     D.2303 = (integer(kind=8)) k;
>                     D.2304 = D.2302 * D.2303;
>                     D.2305 = D.2299 + D.2304;
>                     D.2306 = y.offset;
>                     D.2307 = D.2305 + D.2306;
>                     D.2308 = MEM[(real(kind=4)[0:] *)D.2298][D.2307];
>                     D.2309 = __builtin_cosf (D.2308);
>                     D.2310 = __builtin_acosf (D.2309);
>                     D.2311 = D.2297 + D.2310;
>                     MEM[(real(kind=4)[0:] *)D.2271][D.2282] = D.2311;
>                     L.18:
>                     D.1708 = i == D.1705;
>                     i = i + 1;
>                     if (D.1708 != 0) goto L.19; else goto <D.2312>;
>                     <D.2312>:
>                   }
>                   goto <D.2270>;
>                   <D.2269>:
>                   L.19:
>                 }
>                 L.16:
>                 D.1709 = j == D.1702;
>                 j = j + 1;
>                 if (D.1709 != 0) goto L.17; else goto <D.2313>;
>                 <D.2313>:
>               }
>               goto <D.2267>;
>               <D.2266>:
>               L.17:
>             }
>             L.14:
>             D.1710 = k == D.1699;
>             k = k + 1;
>             if (D.1710 != 0) goto L.15; else goto <D.2314>;
>             <D.2314>:
>           }
>           goto <D.2264>;
>           <D.2263>:
>           L.15:
>         }
>         _gfortran_cpu_time_4 (&finish);
>
> 2) Good LLVM IR:
> ================
>
>
> marcusmae at M17xR4:~/forge/kernelgen/tests/behavior/demo_f$
> KERNELGEN_FALLBACK=1 kernelgen-gfortran -fplugin=dragonegg.so
> -fplugin-arg-dragonegg-emit-ir -fplugin-arg-dragonegg-llvm-ir-optimize=0 -S
> -c demo_f_m.f90 -o - | opt -O3 -S -o -
>
> entry:
>   %0 = load i32* %nx, align 4
>   %1 = sext i32 %0 to i64
>   %2 = icmp slt i64 %1, 0
>   %3 = select i1 %2, i64 0, i64 %1
>   %4 = load i32* %ny, align 4
>   %5 = sext i32 %4 to i64
>   %6 = mul i64 %3, %5
>   %7 = icmp slt i64 %6, 0
>   %8 = select i1 %7, i64 0, i64 %6
>   %9 = load i32* %ns, align 4
>   %not = xor i64 %3, -1
>   %10 = sub i64 %not, %8
>   %11 = icmp sgt i32 %9, 0
>   br i1 %11, label %"3.preheader", label %return
>
> "3.preheader":                                    ; preds = %entry
>   %12 = icmp sgt i32 %4, 0
>   %13 = icmp sgt i32 %0, 0
>   %14 = add i64 %8, %3
>   br i1 %12, label %"4.preheader.us", label %return
>
> "9.us":                                           ; preds = %"
> 4.preheader.us", %"7.us.us"
>   %15 = icmp eq i32 %17, %9
>   %16 = add i32 %17, 1
>   br i1 %15, label %return, label %"4.preheader.us"
>
> "4.preheader.us":                                 ; preds = %"9.us",
> %"3.preheader"
>   %17 = phi i32 [ %16, %"9.us" ], [ 1, %"3.preheader" ]
>   %18 = sext i32 %17 to i64
>   %19 = mul i64 %18, %8
>   %20 = add i64 %19, %10
>   %21 = mul i64 %18, %14
>   %22 = add i64 %21, %10
>   br i1 %13, label %"5.preheader.us.us", label %"9.us"
>
> "7.us.us":                                        ; preds = %"5.us.us"
>   %23 = icmp eq i32 %25, %4
>   %24 = add i32 %25, 1
>   br i1 %23, label %"9.us", label %"5.preheader.us.us"
>
> "5.preheader.us.us":                              ; preds = %"
> 4.preheader.us", %"7.us.us"
>   %25 = phi i32 [ %24, %"7.us.us" ], [ 1, %"4.preheader.us" ]
>   %26 = sext i32 %25 to i64
>   %27 = mul i64 %26, %3
>   %28 = add i64 %20, %27
>   br label %"5.us.us"
>
> "5.us.us":                                        ; preds = %"5.us.us", %"
> 5.preheader.us.us"
>   %29 = phi i32 [ %44, %"5.us.us" ], [ 1, %"5.preheader.us.us" ]
>   %30 = sext i32 %29 to i64
>   %31 = add i64 %28, %30
>   %32 = getelementptr [0 x float]* %x, i64 0, i64 %31
>   %33 = load float* %32, align 4
>   %34 = tail call float @sinf(float %33) nounwind readnone
>   %35 = tail call float @asinf(float %34) nounwind readnone
>   %36 = add i64 %22, %30
>   %37 = getelementptr [0 x float]* %y, i64 0, i64 %36
>   %38 = load float* %37, align 4
>   %39 = tail call float @cosf(float %38) nounwind readnone
>   %40 = tail call float @acosf(float %39) nounwind readnone
>   %41 = fadd float %35, %40
>   %42 = getelementptr [0 x float]* %xy, i64 0, i64 %31
>   store float %41, float* %42, align 4
>   %43 = icmp eq i32 %29, %0
>   %44 = add i32 %29, 1
>   br i1 %43, label %"7.us.us", label %"5.us.us"
>
> return:                                           ; preds =
> %"3.preheader", %"9.us", %entry
>   ret void
>
> GIMPLE:
> =======
>
>   D.2413 = *nx;
>   ubound.7 = (integer(kind=8)) D.2413;
>   stride.9 = ubound.7;
>   stride.9 = MAX_EXPR <stride.9, 0>;
>   D.2414 = *ny;
>   ubound.8 = (integer(kind=8)) D.2414;
>   stride.11 = stride.9 * ubound.8;
>   stride.11 = MAX_EXPR <stride.11, 0>;
>   D.2415 = *ns;
>   ubound.10 = (integer(kind=8)) D.2415;
>   size.13 = stride.11 * ubound.10;
>   size.13 = MAX_EXPR <size.13, 0>;
>   D.1583 = size.13 + -1;
>   size.127 = (bit_size_type) size.13;
>   D.1584 = size.127 * 32;
>   size.128 = (<unnamed-unsigned:64>) size.13;
>   D.1585 = size.128 * 4;
>   D.2418 = ~stride.9;
>   offset.12 = D.2418 - stride.11;
>   D.2419 = *nx;
>   ubound.0 = (integer(kind=8)) D.2419;
>   stride.2 = ubound.0;
>   stride.2 = MAX_EXPR <stride.2, 0>;
>   D.2420 = *ny;
>   ubound.1 = (integer(kind=8)) D.2420;
>   stride.4 = stride.2 * ubound.1;
>   stride.4 = MAX_EXPR <stride.4, 0>;
>   D.2421 = *ns;
>   ubound.3 = (integer(kind=8)) D.2421;
>   size.6 = stride.4 * ubound.3;
>   size.6 = MAX_EXPR <size.6, 0>;
>   D.1580 = size.6 + -1;
>   size.129 = (bit_size_type) size.6;
>   D.1581 = size.129 * 32;
>   size.130 = (<unnamed-unsigned:64>) size.6;
>   D.1582 = size.130 * 4;
>   D.2424 = ~stride.2;
>   offset.5 = D.2424 - stride.4;
>   D.2425 = *nx;
>   ubound.14 = (integer(kind=8)) D.2425;
>   stride.16 = ubound.14;
>   stride.16 = MAX_EXPR <stride.16, 0>;
>   D.2426 = *ny;
>   ubound.15 = (integer(kind=8)) D.2426;
>   stride.18 = stride.16 * ubound.15;
>   stride.18 = MAX_EXPR <stride.18, 0>;
>   D.2427 = *ns;
>   ubound.17 = (integer(kind=8)) D.2427;
>   size.20 = stride.18 * ubound.17;
>   size.20 = MAX_EXPR <size.20, 0>;
>   D.1577 = size.20 + -1;
>   size.131 = (bit_size_type) size.20;
>   D.1578 = size.131 * 32;
>   size.132 = (<unnamed-unsigned:64>) size.20;
>   D.1579 = size.132 * 4;
>   D.2430 = ~stride.16;
>   offset.19 = D.2430 - stride.18;
>   {
>     integer(kind=4) D.1565;
>
>     D.1565 = *ns;
>     k = 1;
>     if (k <= D.1565) goto <D.2431>; else goto <D.2432>;
>     <D.2431>:
>     <D.2433>:
>     {
>       logical(kind=4) D.1576;
>
>       {
>         integer(kind=4) D.1568;
>
>         D.1568 = *ny;
>         j = 1;
>         if (j <= D.1568) goto <D.2434>; else goto <D.2435>;
>         <D.2434>:
>         <D.2436>:
>         {
>           logical(kind=4) D.1575;
>
>           {
>             integer(kind=4) D.1571;
>
>             D.1571 = *nx;
>             i = 1;
>             if (i <= D.1571) goto <D.2437>; else goto <D.2438>;
>             <D.2437>:
>             <D.2439>:
>             {
>               logical(kind=4) D.1574;
>
>               D.2440 = (integer(kind=8)) i;
>               D.2441 = (integer(kind=8)) k;
>               D.2442 = D.2441 * stride.11;
>               D.2443 = (integer(kind=8)) j;
>               D.2444 = D.2443 * stride.9;
>               D.2445 = D.2442 + D.2444;
>               D.2446 = D.2440 + D.2445;
>               D.2447 = D.2446 + offset.12;
>               D.2448 = (integer(kind=8)) i;
>               D.2449 = (integer(kind=8)) k;
>               D.2450 = D.2449 * stride.4;
>               D.2451 = (integer(kind=8)) j;
>               D.2452 = D.2451 * stride.2;
>               D.2453 = D.2450 + D.2452;
>               D.2454 = D.2448 + D.2453;
>               D.2455 = D.2454 + offset.5;
>               D.2456 = *x[D.2455];
>               D.2457 = __builtin_sinf (D.2456);
>               D.2458 = __builtin_asinf (D.2457);
>               D.2459 = (integer(kind=8)) i;
>               D.2460 = stride.18 + stride.16;
>               D.2461 = (integer(kind=8)) k;
>               D.2462 = D.2460 * D.2461;
>               D.2463 = D.2459 + D.2462;
>               D.2464 = D.2463 + offset.19;
>               D.2465 = *y[D.2464];
>               D.2466 = __builtin_cosf (D.2465);
>               D.2467 = __builtin_acosf (D.2466);
>               D.2468 = D.2458 + D.2467;
>               *xy[D.2447] = D.2468;
>               L.5:
>               D.1574 = i == D.1571;
>               i = i + 1;
>               if (D.1574 != 0) goto L.6; else goto <D.2469>;
>               <D.2469>:
>             }
>             goto <D.2439>;
>             <D.2438>:
>             L.6:
>           }
>           L.3:
>           D.1575 = j == D.1568;
>           j = j + 1;
>           if (D.1575 != 0) goto L.4; else goto <D.2470>;
>           <D.2470>:
>         }
>         goto <D.2436>;
>         <D.2435>:
>         L.4:
>       }
>       L.1:
>       D.1576 = k == D.1565;
>       k = k + 1;
>       if (D.1576 != 0) goto L.2; else goto <D.2471>;
>       <D.2471>:
>     }
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121231/1b365ef3/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: demo_f.f90
Type: application/octet-stream
Size: 1524 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121231/1b365ef3/attachment.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: demo_f_m.f90
Type: application/octet-stream
Size: 1710 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121231/1b365ef3/attachment-0001.obj>