[LLVMdev] [DragonEgg] [Polly] Should we expect DragonEgg to produce identical LLVM IR for identical GIMPLE?
Dmitry Mikushin
dmitry at kernelgen.org
Mon Dec 31 10:06:28 PST 2012
Forgot to attach original sources (Fortran).
2012/12/31 Dmitry Mikushin <dmitry at kernelgen.org>
> Dear all,
>
> In our compiler we use a modified version LLVM Polly, which is very
> sensitive to proper code generation. Among the number of limitations, the
> loop region (enclosed by phi node on induction variable and branch) is
> required to be free of additional memory-dependent branches. In other
> words, there must be no conditional "br" instructions below phi nodes. The
> problem we are facing is that from *identical* GIMPLE for 3d loop used in
> different contexts DragonEgg may generate LLVM IR either conforming the
> described limitation, or violating it.
>
> Let's consider two examples. In first one some simple 3D loop is used
> directly in main program. In second - the same 3D loop is extracted into
> separate subroutine called from main. Attached source code and listings for
> GIMPLE and LLVM IR show that although GIMPLE codes are similar, by some
> reason in first case branching goes under phi nodes, making Polly to fail
> with parallelizing the resion. In second case everything is fine.
>
> I have not looked into DragonEgg internals enough yet, but before I'll do,
> the question is: should we expect DragonEgg to produce identical LLVM IRs
> for identical GIMPLEs? The case shown here is really really important. The
> success of parallelization utilities that are currently in a quite good
> shape (thanks to their developers!) nowadays heavily relies on the code
> quality: if IR is too rough, we can't do much about it :(
>
> Many thanks and Happy New Year!
> - Dima.
>
>
> 1) Bad LLVM IR:
> ===============
>
> marcusmae at M17xR4:~/forge/kernelgen/tests/behavior/demo_f$
> KERNELGEN_FALLBACK=1 kernelgen-gfortran -fplugin=dragonegg.so
> -fplugin-arg-dragonegg-emit-ir -fplugin-arg-dragonegg-llvm-ir-optimize=0 -S
> -c demo_f.f90 -o - | opt -O3 -S -o -
>
> "161.i": ; preds = %"160.i",
> %"159.i"
> call void bitcast (void (...)* @_gfortran_cpu_time_4 to void
> (float*)*)(float* %start.i) nounwind
> %204 = load i32* %ns.i, align 4
> %205 = icmp sgt i32 %204, 0
> br i1 %205, label %"162.preheader.i", label %"170.i"
>
> "162.preheader.i": ; preds = %"161.i"
> %206 = bitcast i8* %x.0.0.i to float*
> %207 = add i64 %y.3.2.0.0.i, %y.3.1.0.0.i
> %208 = bitcast i8* %142 to float*
> %.pre.i = load i32* %ny.i, align 4
> %209 = icmp sgt i32 %.pre.i, 0
> br label %"162.i"
>
> "162.i": ; preds = %"168.i",
> %"162.preheader.i"
> %210 = phi i32 [ %240, %"168.i" ], [ 1, %"162.preheader.i" ]
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
> br i1 %209, label %"163.preheader.i", label %"168.i"
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
>
> "163.preheader.i": ; preds = %"162.i"
> %211 = sext i32 %210 to i64
> %212 = mul i64 %211, %86
> %213 = mul i64 %211, %207
> %.pre118.i = load i32* %nx.i, align 4
> %214 = icmp sgt i32 %.pre118.i, 0
> %215 = add i64 %212, %98
> %216 = add i64 %213, %y.1.0.i
> br label %"163.i"
>
> "163.i": ; preds = %"166.i",
> %"163.preheader.i"
> %217 = phi i32 [ %238, %"166.i" ], [ 1, %"163.preheader.i" ]
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
> br i1 %214, label %"164.preheader.i", label %"166.i"
> ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES
> A PROBLEM
>
> "164.preheader.i": ; preds = %"163.i"
> %218 = sext i32 %217 to i64
> %219 = mul i64 %218, %72
> %220 = add i64 %215, %219
> br label %"164.i"
>
> "164.i": ; preds = %"164.i",
> %"164.preheader.i"
> %221 = phi i32 [ %236, %"164.i" ], [ 1, %"164.preheader.i" ]
> %222 = sext i32 %221 to i64
> %223 = add i64 %220, %222
> %224 = getelementptr float* %206, i64 %223
> %225 = load float* %224, align 4
> %226 = call float @sinf(float %225) nounwind readnone
> %227 = call float @asinf(float %226) nounwind readnone
> %228 = add i64 %216, %222
> %229 = getelementptr [0 x float]* %173, i64 0, i64 %228
> %230 = load float* %229, align 4
> %231 = call float @cosf(float %230) nounwind readnone
> %232 = call float @acosf(float %231) nounwind readnone
> %233 = fadd float %227, %232
> %234 = getelementptr float* %208, i64 %223
> store float %233, float* %234, align 4
> %235 = icmp eq i32 %221, %.pre118.i
> %236 = add i32 %221, 1
> br i1 %235, label %"166.i", label %"164.i"
>
> "166.i": ; preds = %"164.i",
> %"163.i"
> %237 = icmp eq i32 %217, %.pre.i
> %238 = add i32 %217, 1
> br i1 %237, label %"168.i", label %"163.i"
>
> "168.i": ; preds = %"166.i",
> %"162.i"
> %239 = icmp eq i32 %210, %204
> %240 = add i32 %210, 1
> br i1 %239, label %"170.i", label %"162.i"
>
> "170.i": ; preds = %"168.i",
> %"161.i"
> call void bitcast (void (...)* @_gfortran_cpu_time_4 to void
> (float*)*)(float* %finish.i) nounwind
> br i1 %37, label %"172.i", label %"171.i"
>
> GIMPLE:
> =======
>
> _gfortran_cpu_time_4 (&start);
> {
> integer(kind=4) D.1699;
>
> D.1699 = ns;
> k = 1;
> if (k <= D.1699) goto <D.2262>; else goto <D.2263>;
> <D.2262>:
> <D.2264>:
> {
> logical(kind=4) D.1710;
>
> {
> integer(kind=4) D.1702;
>
> D.1702 = ny;
> j = 1;
> if (j <= D.1702) goto <D.2265>; else goto <D.2266>;
> <D.2265>:
> <D.2267>:
> {
> logical(kind=4) D.1709;
>
> {
> integer(kind=4) D.1705;
>
> D.1705 = nx;
> i = 1;
> if (i <= D.1705) goto <D.2268>; else goto <D.2269>;
> <D.2268>:
> <D.2270>:
> {
> logical(kind=4) D.1708;
>
> D.2271 = xy.data;
> D.2272 = (integer(kind=8)) i;
> D.2273 = (integer(kind=8)) k;
> D.2274 = xy.dim[2].stride;
> D.2275 = D.2273 * D.2274;
> D.2276 = (integer(kind=8)) j;
> D.2277 = xy.dim[1].stride;
> D.2278 = D.2276 * D.2277;
> D.2279 = D.2275 + D.2278;
> D.2280 = D.2272 + D.2279;
> D.2281 = xy.offset;
> D.2282 = D.2280 + D.2281;
> D.2283 = x.data;
> D.2284 = (integer(kind=8)) i;
> D.2285 = (integer(kind=8)) k;
> D.2286 = x.dim[2].stride;
> D.2287 = D.2285 * D.2286;
> D.2288 = (integer(kind=8)) j;
> D.2289 = x.dim[1].stride;
> D.2290 = D.2288 * D.2289;
> D.2291 = D.2287 + D.2290;
> D.2292 = D.2284 + D.2291;
> D.2293 = x.offset;
> D.2294 = D.2292 + D.2293;
> D.2295 = MEM[(real(kind=4)[0:] *)D.2283][D.2294];
> D.2296 = __builtin_sinf (D.2295);
> D.2297 = __builtin_asinf (D.2296);
> D.2298 = y.data;
> D.2299 = (integer(kind=8)) i;
> D.2300 = y.dim[2].stride;
> D.2301 = y.dim[1].stride;
> D.2302 = D.2300 + D.2301;
> D.2303 = (integer(kind=8)) k;
> D.2304 = D.2302 * D.2303;
> D.2305 = D.2299 + D.2304;
> D.2306 = y.offset;
> D.2307 = D.2305 + D.2306;
> D.2308 = MEM[(real(kind=4)[0:] *)D.2298][D.2307];
> D.2309 = __builtin_cosf (D.2308);
> D.2310 = __builtin_acosf (D.2309);
> D.2311 = D.2297 + D.2310;
> MEM[(real(kind=4)[0:] *)D.2271][D.2282] = D.2311;
> L.18:
> D.1708 = i == D.1705;
> i = i + 1;
> if (D.1708 != 0) goto L.19; else goto <D.2312>;
> <D.2312>:
> }
> goto <D.2270>;
> <D.2269>:
> L.19:
> }
> L.16:
> D.1709 = j == D.1702;
> j = j + 1;
> if (D.1709 != 0) goto L.17; else goto <D.2313>;
> <D.2313>:
> }
> goto <D.2267>;
> <D.2266>:
> L.17:
> }
> L.14:
> D.1710 = k == D.1699;
> k = k + 1;
> if (D.1710 != 0) goto L.15; else goto <D.2314>;
> <D.2314>:
> }
> goto <D.2264>;
> <D.2263>:
> L.15:
> }
> _gfortran_cpu_time_4 (&finish);
>
> 2) Good LLVM IR:
> ================
>
>
> marcusmae at M17xR4:~/forge/kernelgen/tests/behavior/demo_f$
> KERNELGEN_FALLBACK=1 kernelgen-gfortran -fplugin=dragonegg.so
> -fplugin-arg-dragonegg-emit-ir -fplugin-arg-dragonegg-llvm-ir-optimize=0 -S
> -c demo_f_m.f90 -o - | opt -O3 -S -o -
>
> entry:
> %0 = load i32* %nx, align 4
> %1 = sext i32 %0 to i64
> %2 = icmp slt i64 %1, 0
> %3 = select i1 %2, i64 0, i64 %1
> %4 = load i32* %ny, align 4
> %5 = sext i32 %4 to i64
> %6 = mul i64 %3, %5
> %7 = icmp slt i64 %6, 0
> %8 = select i1 %7, i64 0, i64 %6
> %9 = load i32* %ns, align 4
> %not = xor i64 %3, -1
> %10 = sub i64 %not, %8
> %11 = icmp sgt i32 %9, 0
> br i1 %11, label %"3.preheader", label %return
>
> "3.preheader": ; preds = %entry
> %12 = icmp sgt i32 %4, 0
> %13 = icmp sgt i32 %0, 0
> %14 = add i64 %8, %3
> br i1 %12, label %"4.preheader.us", label %return
>
> "9.us": ; preds = %"
> 4.preheader.us", %"7.us.us"
> %15 = icmp eq i32 %17, %9
> %16 = add i32 %17, 1
> br i1 %15, label %return, label %"4.preheader.us"
>
> "4.preheader.us": ; preds = %"9.us",
> %"3.preheader"
> %17 = phi i32 [ %16, %"9.us" ], [ 1, %"3.preheader" ]
> %18 = sext i32 %17 to i64
> %19 = mul i64 %18, %8
> %20 = add i64 %19, %10
> %21 = mul i64 %18, %14
> %22 = add i64 %21, %10
> br i1 %13, label %"5.preheader.us.us", label %"9.us"
>
> "7.us.us": ; preds = %"5.us.us"
> %23 = icmp eq i32 %25, %4
> %24 = add i32 %25, 1
> br i1 %23, label %"9.us", label %"5.preheader.us.us"
>
> "5.preheader.us.us": ; preds = %"
> 4.preheader.us", %"7.us.us"
> %25 = phi i32 [ %24, %"7.us.us" ], [ 1, %"4.preheader.us" ]
> %26 = sext i32 %25 to i64
> %27 = mul i64 %26, %3
> %28 = add i64 %20, %27
> br label %"5.us.us"
>
> "5.us.us": ; preds = %"5.us.us", %"
> 5.preheader.us.us"
> %29 = phi i32 [ %44, %"5.us.us" ], [ 1, %"5.preheader.us.us" ]
> %30 = sext i32 %29 to i64
> %31 = add i64 %28, %30
> %32 = getelementptr [0 x float]* %x, i64 0, i64 %31
> %33 = load float* %32, align 4
> %34 = tail call float @sinf(float %33) nounwind readnone
> %35 = tail call float @asinf(float %34) nounwind readnone
> %36 = add i64 %22, %30
> %37 = getelementptr [0 x float]* %y, i64 0, i64 %36
> %38 = load float* %37, align 4
> %39 = tail call float @cosf(float %38) nounwind readnone
> %40 = tail call float @acosf(float %39) nounwind readnone
> %41 = fadd float %35, %40
> %42 = getelementptr [0 x float]* %xy, i64 0, i64 %31
> store float %41, float* %42, align 4
> %43 = icmp eq i32 %29, %0
> %44 = add i32 %29, 1
> br i1 %43, label %"7.us.us", label %"5.us.us"
>
> return: ; preds =
> %"3.preheader", %"9.us", %entry
> ret void
>
> GIMPLE:
> =======
>
> D.2413 = *nx;
> ubound.7 = (integer(kind=8)) D.2413;
> stride.9 = ubound.7;
> stride.9 = MAX_EXPR <stride.9, 0>;
> D.2414 = *ny;
> ubound.8 = (integer(kind=8)) D.2414;
> stride.11 = stride.9 * ubound.8;
> stride.11 = MAX_EXPR <stride.11, 0>;
> D.2415 = *ns;
> ubound.10 = (integer(kind=8)) D.2415;
> size.13 = stride.11 * ubound.10;
> size.13 = MAX_EXPR <size.13, 0>;
> D.1583 = size.13 + -1;
> size.127 = (bit_size_type) size.13;
> D.1584 = size.127 * 32;
> size.128 = (<unnamed-unsigned:64>) size.13;
> D.1585 = size.128 * 4;
> D.2418 = ~stride.9;
> offset.12 = D.2418 - stride.11;
> D.2419 = *nx;
> ubound.0 = (integer(kind=8)) D.2419;
> stride.2 = ubound.0;
> stride.2 = MAX_EXPR <stride.2, 0>;
> D.2420 = *ny;
> ubound.1 = (integer(kind=8)) D.2420;
> stride.4 = stride.2 * ubound.1;
> stride.4 = MAX_EXPR <stride.4, 0>;
> D.2421 = *ns;
> ubound.3 = (integer(kind=8)) D.2421;
> size.6 = stride.4 * ubound.3;
> size.6 = MAX_EXPR <size.6, 0>;
> D.1580 = size.6 + -1;
> size.129 = (bit_size_type) size.6;
> D.1581 = size.129 * 32;
> size.130 = (<unnamed-unsigned:64>) size.6;
> D.1582 = size.130 * 4;
> D.2424 = ~stride.2;
> offset.5 = D.2424 - stride.4;
> D.2425 = *nx;
> ubound.14 = (integer(kind=8)) D.2425;
> stride.16 = ubound.14;
> stride.16 = MAX_EXPR <stride.16, 0>;
> D.2426 = *ny;
> ubound.15 = (integer(kind=8)) D.2426;
> stride.18 = stride.16 * ubound.15;
> stride.18 = MAX_EXPR <stride.18, 0>;
> D.2427 = *ns;
> ubound.17 = (integer(kind=8)) D.2427;
> size.20 = stride.18 * ubound.17;
> size.20 = MAX_EXPR <size.20, 0>;
> D.1577 = size.20 + -1;
> size.131 = (bit_size_type) size.20;
> D.1578 = size.131 * 32;
> size.132 = (<unnamed-unsigned:64>) size.20;
> D.1579 = size.132 * 4;
> D.2430 = ~stride.16;
> offset.19 = D.2430 - stride.18;
> {
> integer(kind=4) D.1565;
>
> D.1565 = *ns;
> k = 1;
> if (k <= D.1565) goto <D.2431>; else goto <D.2432>;
> <D.2431>:
> <D.2433>:
> {
> logical(kind=4) D.1576;
>
> {
> integer(kind=4) D.1568;
>
> D.1568 = *ny;
> j = 1;
> if (j <= D.1568) goto <D.2434>; else goto <D.2435>;
> <D.2434>:
> <D.2436>:
> {
> logical(kind=4) D.1575;
>
> {
> integer(kind=4) D.1571;
>
> D.1571 = *nx;
> i = 1;
> if (i <= D.1571) goto <D.2437>; else goto <D.2438>;
> <D.2437>:
> <D.2439>:
> {
> logical(kind=4) D.1574;
>
> D.2440 = (integer(kind=8)) i;
> D.2441 = (integer(kind=8)) k;
> D.2442 = D.2441 * stride.11;
> D.2443 = (integer(kind=8)) j;
> D.2444 = D.2443 * stride.9;
> D.2445 = D.2442 + D.2444;
> D.2446 = D.2440 + D.2445;
> D.2447 = D.2446 + offset.12;
> D.2448 = (integer(kind=8)) i;
> D.2449 = (integer(kind=8)) k;
> D.2450 = D.2449 * stride.4;
> D.2451 = (integer(kind=8)) j;
> D.2452 = D.2451 * stride.2;
> D.2453 = D.2450 + D.2452;
> D.2454 = D.2448 + D.2453;
> D.2455 = D.2454 + offset.5;
> D.2456 = *x[D.2455];
> D.2457 = __builtin_sinf (D.2456);
> D.2458 = __builtin_asinf (D.2457);
> D.2459 = (integer(kind=8)) i;
> D.2460 = stride.18 + stride.16;
> D.2461 = (integer(kind=8)) k;
> D.2462 = D.2460 * D.2461;
> D.2463 = D.2459 + D.2462;
> D.2464 = D.2463 + offset.19;
> D.2465 = *y[D.2464];
> D.2466 = __builtin_cosf (D.2465);
> D.2467 = __builtin_acosf (D.2466);
> D.2468 = D.2458 + D.2467;
> *xy[D.2447] = D.2468;
> L.5:
> D.1574 = i == D.1571;
> i = i + 1;
> if (D.1574 != 0) goto L.6; else goto <D.2469>;
> <D.2469>:
> }
> goto <D.2439>;
> <D.2438>:
> L.6:
> }
> L.3:
> D.1575 = j == D.1568;
> j = j + 1;
> if (D.1575 != 0) goto L.4; else goto <D.2470>;
> <D.2470>:
> }
> goto <D.2436>;
> <D.2435>:
> L.4:
> }
> L.1:
> D.1576 = k == D.1565;
> k = k + 1;
> if (D.1576 != 0) goto L.2; else goto <D.2471>;
> <D.2471>:
> }
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121231/1b365ef3/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: demo_f.f90
Type: application/octet-stream
Size: 1524 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121231/1b365ef3/attachment.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: demo_f_m.f90
Type: application/octet-stream
Size: 1710 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121231/1b365ef3/attachment-0001.obj>
More information about the llvm-dev
mailing list