Dear all,<br><br>In our compiler we use a modified version of LLVM Polly, which is very sensitive to proper code generation. Among a number of limitations, the loop region (enclosed by the phi node on the induction variable and the branch) is required to be free of additional memory-dependent branches. In other words, there must be no conditional "br" instructions below phi nodes. The problem we are facing is that, from *identical* GIMPLE for a 3D loop used in different contexts, DragonEgg may generate LLVM IR that either conforms to the described limitation or violates it.<br>
<br>Let's consider two examples. In the first one, a simple 3D loop is used directly in the main program. In the second, the same 3D loop is extracted into a separate subroutine called from main. The attached source code and the listings for GIMPLE and LLVM IR show that, although the GIMPLE code is similar, for some reason in the first case the branching goes under the phi nodes, causing Polly to fail to parallelize the region. In the second case everything is fine.<br>
<br>I have not looked into DragonEgg internals enough yet, but before I do, the question is: should we expect DragonEgg to produce identical LLVM IR for identical GIMPLE? The case shown here is really, really important. The success of parallelization utilities, which are currently in quite good shape (thanks to their developers!), nowadays relies heavily on code quality: if the IR is too rough, we can't do much about it :(<br>
<br>Many thanks and Happy New Year!<br>- Dima.<br><br><br>1) Bad LLVM IR:<br>===============<br><br>marcusmae@M17xR4:~/forge/kernelgen/tests/behavior/demo_f$ KERNELGEN_FALLBACK=1 kernelgen-gfortran -fplugin=dragonegg.so -fplugin-arg-dragonegg-emit-ir -fplugin-arg-dragonegg-llvm-ir-optimize=0 -S -c demo_f.f90 -o - | opt -O3 -S -o -<br>
<br>"161.i": ; preds = %"160.i", %"159.i"<br> call void bitcast (void (...)* @_gfortran_cpu_time_4 to void (float*)*)(float* %start.i) nounwind<br> %204 = load i32* %ns.i, align 4<br>
%205 = icmp sgt i32 %204, 0<br> br i1 %205, label %"162.preheader.i", label %"170.i"<br><br>"162.preheader.i": ; preds = %"161.i"<br> %206 = bitcast i8* %x.0.0.i to float*<br>
%207 = add i64 %y.3.2.0.0.i, %y.3.1.0.0.i<br> %208 = bitcast i8* %142 to float*<br> %.pre.i = load i32* %ny.i, align 4<br> %209 = icmp sgt i32 %.pre.i, 0<br> br label %"162.i"<br><br>"162.i": ; preds = %"168.i", %"162.preheader.i"<br>
%210 = phi i32 [ %240, %"168.i" ], [ 1, %"162.preheader.i" ]<br>; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES A PROBLEM<br> br i1 %209, label %"163.preheader.i", label %"168.i"<br>
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES A PROBLEM<br><br>"163.preheader.i": ; preds = %"162.i"<br> %211 = sext i32 %210 to i64<br> %212 = mul i64 %211, %86<br>
%213 = mul i64 %211, %207<br> %.pre118.i = load i32* %nx.i, align 4<br> %214 = icmp sgt i32 %.pre118.i, 0<br> %215 = add i64 %212, %98<br> %216 = add i64 %213, %y.1.0.i<br> br label %"163.i"<br><br>"163.i": ; preds = %"166.i", %"163.preheader.i"<br>
%217 = phi i32 [ %238, %"166.i" ], [ 1, %"163.preheader.i" ]<br>; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES A PROBLEM<br> br i1 %214, label %"164.preheader.i", label %"166.i"<br>
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THIS BRANCH CREATES A PROBLEM<br><br>"164.preheader.i": ; preds = %"163.i"<br> %218 = sext i32 %217 to i64<br> %219 = mul i64 %218, %72<br>
%220 = add i64 %215, %219<br> br label %"164.i"<br><br>"164.i": ; preds = %"164.i", %"164.preheader.i"<br> %221 = phi i32 [ %236, %"164.i" ], [ 1, %"164.preheader.i" ]<br>
%222 = sext i32 %221 to i64<br> %223 = add i64 %220, %222<br> %224 = getelementptr float* %206, i64 %223<br> %225 = load float* %224, align 4<br> %226 = call float @sinf(float %225) nounwind readnone<br> %227 = call float @asinf(float %226) nounwind readnone<br>
%228 = add i64 %216, %222<br> %229 = getelementptr [0 x float]* %173, i64 0, i64 %228<br> %230 = load float* %229, align 4<br> %231 = call float @cosf(float %230) nounwind readnone<br> %232 = call float @acosf(float %231) nounwind readnone<br>
%233 = fadd float %227, %232<br> %234 = getelementptr float* %208, i64 %223<br> store float %233, float* %234, align 4<br> %235 = icmp eq i32 %221, %.pre118.i<br> %236 = add i32 %221, 1<br> br i1 %235, label %"166.i", label %"164.i"<br>
<br>"166.i": ; preds = %"164.i", %"163.i"<br> %237 = icmp eq i32 %217, %.pre.i<br> %238 = add i32 %217, 1<br> br i1 %237, label %"168.i", label %"163.i"<br>
<br>"168.i": ; preds = %"166.i", %"162.i"<br> %239 = icmp eq i32 %210, %204<br> %240 = add i32 %210, 1<br> br i1 %239, label %"170.i", label %"162.i"<br>
<br>"170.i": ; preds = %"168.i", %"161.i"<br> call void bitcast (void (...)* @_gfortran_cpu_time_4 to void (float*)*)(float* %finish.i) nounwind<br> br i1 %37, label %"172.i", label %"171.i"<br>
<br>GIMPLE:<br>=======<br><br> _gfortran_cpu_time_4 (&start);<br> {<br> integer(kind=4) D.1699;<br><br> D.1699 = ns;<br> k = 1;<br> if (k <= D.1699) goto <D.2262>; else goto <D.2263>;<br>
<D.2262>:<br> <D.2264>:<br> {<br> logical(kind=4) D.1710;<br><br> {<br> integer(kind=4) D.1702;<br><br> D.1702 = ny;<br> j = 1;<br>
if (j <= D.1702) goto <D.2265>; else goto <D.2266>;<br> <D.2265>:<br> <D.2267>:<br> {<br> logical(kind=4) D.1709;<br><br> {<br>
integer(kind=4) D.1705;<br><br> D.1705 = nx;<br> i = 1;<br> if (i <= D.1705) goto <D.2268>; else goto <D.2269>;<br> <D.2268>:<br>
<D.2270>:<br> {<br> logical(kind=4) D.1708;<br><br> D.2271 = xy.data;<br> D.2272 = (integer(kind=8)) i;<br> D.2273 = (integer(kind=8)) k;<br>
D.2274 = xy.dim[2].stride;<br> D.2275 = D.2273 * D.2274;<br> D.2276 = (integer(kind=8)) j;<br> D.2277 = xy.dim[1].stride;<br> D.2278 = D.2276 * D.2277;<br>
D.2279 = D.2275 + D.2278;<br> D.2280 = D.2272 + D.2279;<br> D.2281 = xy.offset;<br> D.2282 = D.2280 + D.2281;<br> D.2283 = x.data;<br>
D.2284 = (integer(kind=8)) i;<br> D.2285 = (integer(kind=8)) k;<br> D.2286 = x.dim[2].stride;<br> D.2287 = D.2285 * D.2286;<br> D.2288 = (integer(kind=8)) j;<br>
D.2289 = x.dim[1].stride;<br> D.2290 = D.2288 * D.2289;<br> D.2291 = D.2287 + D.2290;<br> D.2292 = D.2284 + D.2291;<br> D.2293 = x.offset;<br>
D.2294 = D.2292 + D.2293;<br> D.2295 = MEM[(real(kind=4)[0:] *)D.2283][D.2294];<br> D.2296 = __builtin_sinf (D.2295);<br> D.2297 = __builtin_asinf (D.2296);<br>
D.2298 = y.data;<br> D.2299 = (integer(kind=8)) i;<br> D.2300 = y.dim[2].stride;<br> D.2301 = y.dim[1].stride;<br> D.2302 = D.2300 + D.2301;<br>
D.2303 = (integer(kind=8)) k;<br> D.2304 = D.2302 * D.2303;<br> D.2305 = D.2299 + D.2304;<br> D.2306 = y.offset;<br> D.2307 = D.2305 + D.2306;<br>
D.2308 = MEM[(real(kind=4)[0:] *)D.2298][D.2307];<br> D.2309 = __builtin_cosf (D.2308);<br> D.2310 = __builtin_acosf (D.2309);<br> D.2311 = D.2297 + D.2310;<br>
MEM[(real(kind=4)[0:] *)D.2271][D.2282] = D.2311;<br> L.18:<br> D.1708 = i == D.1705;<br> i = i + 1;<br> if (D.1708 != 0) goto L.19; else goto <D.2312>;<br>
<D.2312>:<br> }<br> goto <D.2270>;<br> <D.2269>:<br> L.19:<br> }<br> L.16:<br> D.1709 = j == D.1702;<br>
j = j + 1;<br> if (D.1709 != 0) goto L.17; else goto <D.2313>;<br> <D.2313>:<br> }<br> goto <D.2267>;<br> <D.2266>:<br>
L.17:<br> }<br> L.14:<br> D.1710 = k == D.1699;<br> k = k + 1;<br> if (D.1710 != 0) goto L.15; else goto <D.2314>;<br> <D.2314>:<br>
}<br> goto <D.2264>;<br> <D.2263>:<br> L.15:<br> }<br> _gfortran_cpu_time_4 (&finish);<br><br>2) Good LLVM IR:<br>================<br><br><br>marcusmae@M17xR4:~/forge/kernelgen/tests/behavior/demo_f$ KERNELGEN_FALLBACK=1 kernelgen-gfortran -fplugin=dragonegg.so -fplugin-arg-dragonegg-emit-ir -fplugin-arg-dragonegg-llvm-ir-optimize=0 -S -c demo_f_m.f90 -o - | opt -O3 -S -o -<br>
<br>entry:<br> %0 = load i32* %nx, align 4<br> %1 = sext i32 %0 to i64<br> %2 = icmp slt i64 %1, 0<br> %3 = select i1 %2, i64 0, i64 %1<br> %4 = load i32* %ny, align 4<br> %5 = sext i32 %4 to i64<br> %6 = mul i64 %3, %5<br>
%7 = icmp slt i64 %6, 0<br> %8 = select i1 %7, i64 0, i64 %6<br> %9 = load i32* %ns, align 4<br> %not = xor i64 %3, -1<br> %10 = sub i64 %not, %8<br> %11 = icmp sgt i32 %9, 0<br> br i1 %11, label %"3.preheader", label %return<br>
<br>"3.preheader": ; preds = %entry<br> %12 = icmp sgt i32 %4, 0<br> %13 = icmp sgt i32 %0, 0<br> %14 = add i64 %8, %3<br> br i1 %12, label %"<a href="http://4.preheader.us">4.preheader.us</a>", label %return<br>
<br>"<a href="http://9.us">9.us</a>": ; preds = %"<a href="http://4.preheader.us">4.preheader.us</a>", %"<a href="http://7.us.us">7.us.us</a>"<br> %15 = icmp eq i32 %17, %9<br>
%16 = add i32 %17, 1<br> br i1 %15, label %return, label %"<a href="http://4.preheader.us">4.preheader.us</a>"<br><br>"<a href="http://4.preheader.us">4.preheader.us</a>": ; preds = %"<a href="http://9.us">9.us</a>", %"3.preheader"<br>
%17 = phi i32 [ %16, %"<a href="http://9.us">9.us</a>" ], [ 1, %"3.preheader" ]<br> %18 = sext i32 %17 to i64<br> %19 = mul i64 %18, %8<br> %20 = add i64 %19, %10<br> %21 = mul i64 %18, %14<br> %22 = add i64 %21, %10<br>
br i1 %13, label %"<a href="http://5.preheader.us.us">5.preheader.us.us</a>", label %"<a href="http://9.us">9.us</a>"<br><br>"<a href="http://7.us.us">7.us.us</a>": ; preds = %"<a href="http://5.us.us">5.us.us</a>"<br>
%23 = icmp eq i32 %25, %4<br> %24 = add i32 %25, 1<br> br i1 %23, label %"<a href="http://9.us">9.us</a>", label %"<a href="http://5.preheader.us.us">5.preheader.us.us</a>"<br><br>"<a href="http://5.preheader.us.us">5.preheader.us.us</a>": ; preds = %"<a href="http://4.preheader.us">4.preheader.us</a>", %"<a href="http://7.us.us">7.us.us</a>"<br>
%25 = phi i32 [ %24, %"<a href="http://7.us.us">7.us.us</a>" ], [ 1, %"<a href="http://4.preheader.us">4.preheader.us</a>" ]<br> %26 = sext i32 %25 to i64<br> %27 = mul i64 %26, %3<br> %28 = add i64 %20, %27<br>
br label %"<a href="http://5.us.us">5.us.us</a>"<br><br>"<a href="http://5.us.us">5.us.us</a>": ; preds = %"<a href="http://5.us.us">5.us.us</a>", %"<a href="http://5.preheader.us.us">5.preheader.us.us</a>"<br>
%29 = phi i32 [ %44, %"<a href="http://5.us.us">5.us.us</a>" ], [ 1, %"<a href="http://5.preheader.us.us">5.preheader.us.us</a>" ]<br> %30 = sext i32 %29 to i64<br> %31 = add i64 %28, %30<br> %32 = getelementptr [0 x float]* %x, i64 0, i64 %31<br>
%33 = load float* %32, align 4<br> %34 = tail call float @sinf(float %33) nounwind readnone<br> %35 = tail call float @asinf(float %34) nounwind readnone<br> %36 = add i64 %22, %30<br> %37 = getelementptr [0 x float]* %y, i64 0, i64 %36<br>
%38 = load float* %37, align 4<br> %39 = tail call float @cosf(float %38) nounwind readnone<br> %40 = tail call float @acosf(float %39) nounwind readnone<br> %41 = fadd float %35, %40<br> %42 = getelementptr [0 x float]* %xy, i64 0, i64 %31<br>
store float %41, float* %42, align 4<br> %43 = icmp eq i32 %29, %0<br> %44 = add i32 %29, 1<br> br i1 %43, label %"<a href="http://7.us.us">7.us.us</a>", label %"<a href="http://5.us.us">5.us.us</a>"<br>
<br>return: ; preds = %"3.preheader", %"<a href="http://9.us">9.us</a>", %entry<br> ret void<br><br>GIMPLE:<br>=======<br><br> D.2413 = *nx;<br> ubound.7 = (integer(kind=8)) D.2413;<br>
stride.9 = ubound.7;<br> stride.9 = MAX_EXPR <stride.9, 0>;<br> D.2414 = *ny;<br> ubound.8 = (integer(kind=8)) D.2414;<br> stride.11 = stride.9 * ubound.8;<br> stride.11 = MAX_EXPR <stride.11, 0>;<br> D.2415 = *ns;<br>
ubound.10 = (integer(kind=8)) D.2415;<br> size.13 = stride.11 * ubound.10;<br> size.13 = MAX_EXPR <size.13, 0>;<br> D.1583 = size.13 + -1;<br> size.127 = (bit_size_type) size.13;<br> D.1584 = size.127 * 32;<br>
size.128 = (<unnamed-unsigned:64>) size.13;<br> D.1585 = size.128 * 4;<br> D.2418 = ~stride.9;<br> offset.12 = D.2418 - stride.11;<br> D.2419 = *nx;<br> ubound.0 = (integer(kind=8)) D.2419;<br> stride.2 = ubound.0;<br>
stride.2 = MAX_EXPR <stride.2, 0>;<br> D.2420 = *ny;<br> ubound.1 = (integer(kind=8)) D.2420;<br> stride.4 = stride.2 * ubound.1;<br> stride.4 = MAX_EXPR <stride.4, 0>;<br> D.2421 = *ns;<br> ubound.3 = (integer(kind=8)) D.2421;<br>
size.6 = stride.4 * ubound.3;<br> size.6 = MAX_EXPR <size.6, 0>;<br> D.1580 = size.6 + -1;<br> size.129 = (bit_size_type) size.6;<br> D.1581 = size.129 * 32;<br> size.130 = (<unnamed-unsigned:64>) size.6;<br>
D.1582 = size.130 * 4;<br> D.2424 = ~stride.2;<br> offset.5 = D.2424 - stride.4;<br> D.2425 = *nx;<br> ubound.14 = (integer(kind=8)) D.2425;<br> stride.16 = ubound.14;<br> stride.16 = MAX_EXPR <stride.16, 0>;<br>
D.2426 = *ny;<br> ubound.15 = (integer(kind=8)) D.2426;<br> stride.18 = stride.16 * ubound.15;<br> stride.18 = MAX_EXPR <stride.18, 0>;<br> D.2427 = *ns;<br> ubound.17 = (integer(kind=8)) D.2427;<br> size.20 = stride.18 * ubound.17;<br>
size.20 = MAX_EXPR <size.20, 0>;<br> D.1577 = size.20 + -1;<br> size.131 = (bit_size_type) size.20;<br> D.1578 = size.131 * 32;<br> size.132 = (<unnamed-unsigned:64>) size.20;<br> D.1579 = size.132 * 4;<br>
D.2430 = ~stride.16;<br> offset.19 = D.2430 - stride.18;<br> {<br> integer(kind=4) D.1565;<br><br> D.1565 = *ns;<br> k = 1;<br> if (k <= D.1565) goto <D.2431>; else goto <D.2432>;<br> <D.2431>:<br>
<D.2433>:<br> {<br> logical(kind=4) D.1576;<br><br> {<br> integer(kind=4) D.1568;<br><br> D.1568 = *ny;<br> j = 1;<br> if (j <= D.1568) goto <D.2434>; else goto <D.2435>;<br>
<D.2434>:<br> <D.2436>:<br> {<br> logical(kind=4) D.1575;<br><br> {<br> integer(kind=4) D.1571;<br><br> D.1571 = *nx;<br> i = 1;<br> if (i <= D.1571) goto <D.2437>; else goto <D.2438>;<br>
<D.2437>:<br> <D.2439>:<br> {<br> logical(kind=4) D.1574;<br><br> D.2440 = (integer(kind=8)) i;<br> D.2441 = (integer(kind=8)) k;<br> D.2442 = D.2441 * stride.11;<br>
D.2443 = (integer(kind=8)) j;<br> D.2444 = D.2443 * stride.9;<br> D.2445 = D.2442 + D.2444;<br> D.2446 = D.2440 + D.2445;<br> D.2447 = D.2446 + offset.12;<br>
D.2448 = (integer(kind=8)) i;<br> D.2449 = (integer(kind=8)) k;<br> D.2450 = D.2449 * stride.4;<br> D.2451 = (integer(kind=8)) j;<br> D.2452 = D.2451 * stride.2;<br>
D.2453 = D.2450 + D.2452;<br> D.2454 = D.2448 + D.2453;<br> D.2455 = D.2454 + offset.5;<br> D.2456 = *x[D.2455];<br> D.2457 = __builtin_sinf (D.2456);<br> D.2458 = __builtin_asinf (D.2457);<br>
D.2459 = (integer(kind=8)) i;<br> D.2460 = stride.18 + stride.16;<br> D.2461 = (integer(kind=8)) k;<br> D.2462 = D.2460 * D.2461;<br> D.2463 = D.2459 + D.2462;<br>
D.2464 = D.2463 + offset.19;<br> D.2465 = *y[D.2464];<br> D.2466 = __builtin_cosf (D.2465);<br> D.2467 = __builtin_acosf (D.2466);<br> D.2468 = D.2458 + D.2467;<br>
*xy[D.2447] = D.2468;<br> L.5:<br> D.1574 = i == D.1571;<br> i = i + 1;<br> if (D.1574 != 0) goto L.6; else goto <D.2469>;<br> <D.2469>:<br>
}<br> goto <D.2439>;<br> <D.2438>:<br> L.6:<br> }<br> L.3:<br> D.1575 = j == D.1568;<br> j = j + 1;<br> if (D.1575 != 0) goto L.4; else goto <D.2470>;<br>
<D.2470>:<br> }<br> goto <D.2436>;<br> <D.2435>:<br> L.4:<br> }<br> L.1:<br> D.1576 = k == D.1565;<br> k = k + 1;<br> if (D.1576 != 0) goto L.2; else goto <D.2471>;<br>
<D.2471>:<br> }<br><br>