[llvm-dev] Loop Unroll

Wed May 27 04:42:57 PDT 2020

Thanks for the tip!

What is the D80619 you mention? Where can I find it?

El mié., 27 may. 2020 a las 13:04, David Green (<David.Green at arm.com>)
escribió:

> Hello
>
> You can add -debug to (usually) get some more information as to what is
> going on. In this case, depending on your target, it won't actually tell
> you much because turning on unrollandjam also requires a target option at
> the moment. Try adding -allow-unroll-and-jam to override the target's
> preference and enable it.
>
> It will then tell you that the inner loop is more complex than it was
> expecting. Use loop rotate to turn that inner loop into a single block.
>
> So:
> clang -O0 -Xclang -disable-O0-optnone -g0  -emit-llvm -S -o unj.ll unj.c
> opt -mem2reg -simplifycfg -loop-rotate -instcombine unj.ll -S -o unj2.ll
> opt unj2.ll --loop-unroll-and-jam --unroll-and-jam-count=4
> -allow-unroll-and-jam
>
> Unfortunately it will then unroll and jam too much! The first i loop, the
> matrix multiply j loop _and_ the matrix multiply i loop all get unrolled
> and jammed when I tried it. Although I was using an n of 8, not 4. There is
> still an issue in unroll and jam where the loop info is not updated
> correctly, which may then cause it to crash. D80619 is a (second) fix for
> that, if we don't go with another solution.
>
> With a tripcount of 4 you will often end up just completely unrolling the
> loops.
> Dave
>
>
> From: llvm-dev <llvm-dev-bounces at lists.llvm.org> on behalf of legend xx
> via llvm-dev <llvm-dev at lists.llvm.org>
> Sent: 26 May 2020 18:55
> To: Florian Hahn <florian_hahn at apple.com>
> Cc: llvm-dev <llvm-dev at lists.llvm.org>
> Subject: Re: [llvm-dev] Loop Unroll
>
> Awesome, thanks!
>
> Now I have another question. I have a matrix multiplication code. This is
> my code:
>
> #include <stdio.h>
> #include <stdlib.h>
>
> #define n 4
>
> int main(int argc, char *argv[]) {
>     int i, j, k;
>
>     int A[n][n], B[n][n], C[n][n];
>     for(i=0;i<n;i++){
>         for(j=0;j<n;j++){
>            A[i][j] = 1;
>            B[i][j] = 2;
>            C[i][j] = 0;
>         }
>     }
>
>     for(i=0;i<n;i++){
>         for(j=0;j<n;j++){
>             for(k=0;k<n;k++){
>                 C[i][j]=(C[i][j]+(A[i][k]*B[k][j]));
>             }
>         }
>     }
>
>     return 0;
> }
>
>
> I tried running the loop-unroll-and-jam pass over these loops. I ran:
>
> $ clang -O0 -Xclang -disable-O0-optnone -emit-llvm mult.c -S -o mult.ll
> $ opt -O0 -S -mem2reg -simplifycfg -view-cfg mult.ll -o multopt.ll
> $ opt -O0 -S -mem2reg -simplifycfg --loop-unroll-and-jam
> --unroll-and-jam-count=4 -simplifycfg -view-cfg mult.ll -o
> mult-opt00-unroll4.ll
>
> I get the same CFG in both cases (I attach it). Also, I tried with the -O1
> level (opt -O1 -loop-unroll-and-jam --unroll-and-jam-count=4), but I didn't
> see any difference.
>
> Why does the --loop-unroll-and-jam pass not work?
>
>
>
>
>
>
> El dom., 24 may. 2020 a las 14:36, Florian Hahn (<florian_hahn at apple.com>)
> escribió:
>
>
> On May 23, 2020, at 17:15, legend xx <legendaryxx7slh at gmail.com> wrote:
>
> This is my example (for.c):
>
> #include <stdio.h>
>
> int add(int a, int b) {
>     return a + b;
> }
>
> int main() {
>    int a, b, c, d;
>    a = 5;
>    b = 15;
>    c = add(a, b);
>    d = 0;
>    for(int i=0;i<16;i++)
>        d = add(c, d);
> }
>
> I run:
> $ clang -O0 -Xclang -disable-O0-optnone -emit-llvm for.c -S -o forO0.ll
> $ opt -O0 -S --loop-unroll --unroll-count=4 -view-cfg forO0.ll -o
> for-opt00-unroll4.ll
>
> And this is the LLVM IR code that I get:
>
> ; ModuleID = 'forO0.ll'
> source_filename = "for.c"
> target datalayout =
> "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @add(i32 %a, i32 %b) #0 {
> entry:
>   %a.addr = alloca i32, align 4
>   %b.addr = alloca i32, align 4
>   store i32 %a, i32* %a.addr, align 4
>   store i32 %b, i32* %b.addr, align 4
>   %0 = load i32, i32* %a.addr, align 4
>   %1 = load i32, i32* %b.addr, align 4
>   %add = add nsw i32 %0, %1
>   ret i32 %add
> }
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @main() #0 {
> entry:
>   %retval = alloca i32, align 4
>   %a = alloca i32, align 4
>   %b = alloca i32, align 4
>   %c = alloca i32, align 4
>   %d = alloca i32, align 4
>   %i = alloca i32, align 4
>   store i32 0, i32* %retval, align 4
>   store i32 5, i32* %a, align 4
>   store i32 15, i32* %b, align 4
>   %0 = load i32, i32* %a, align 4
>   %1 = load i32, i32* %b, align 4
>   %call = call i32 @add(i32 %0, i32 %1)
>   store i32 %call, i32* %c, align 4
>   store i32 0, i32* %d, align 4
>   store i32 0, i32* %i, align 4
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.inc.3,
> %entry
>   %2 = load i32, i32* %i, align 4
>   %cmp = icmp slt i32 %2, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %3 = load i32, i32* %c, align 4
>   %4 = load i32, i32* %d, align 4
>   %call1 = call i32 @add(i32 %3, i32 %4)
>   store i32 %call1, i32* %d, align 4
>   br label %for.inc
>
> for.inc:                                          ; preds = %for.body
>   %5 = load i32, i32* %i, align 4
>   %inc = add nsw i32 %5, 1
>   store i32 %inc, i32* %i, align 4
>   %6 = load i32, i32* %i, align 4
>   %cmp.1 = icmp slt i32 %6, 16
>   br i1 %cmp.1, label %for.body.1, label %for.end
>
> for.end:                                          ; preds = %for.inc.2,
> %for.inc.1, %for.inc, %for.cond
>   %7 = load i32, i32* %d, align 4
>   %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x
> i8], [20 x i8]* @.str, i64 0, i64 0), i32 %7)
>   %8 = load i32, i32* %retval, align 4
>   ret i32 %8
>
> for.body.1:                                       ; preds = %for.inc
>   %9 = load i32, i32* %c, align 4
>   %10 = load i32, i32* %d, align 4
>   %call1.1 = call i32 @add(i32 %9, i32 %10)
>   store i32 %call1.1, i32* %d, align 4
>   br label %for.inc.1
>
> for.inc.1:                                        ; preds = %for.body.1
>   %11 = load i32, i32* %i, align 4
>   %inc.1 = add nsw i32 %11, 1
>   store i32 %inc.1, i32* %i, align 4
>   %12 = load i32, i32* %i, align 4
>   %cmp.2 = icmp slt i32 %12, 16
>   br i1 %cmp.2, label %for.body.2, label %for.end
>
> for.body.2:                                       ; preds = %for.inc.1
>   %13 = load i32, i32* %c, align 4
>   %14 = load i32, i32* %d, align 4
>   %call1.2 = call i32 @add(i32 %13, i32 %14)
>   store i32 %call1.2, i32* %d, align 4
>   br label %for.inc.2
>
> for.inc.2:                                        ; preds = %for.body.2
>   %15 = load i32, i32* %i, align 4
>   %inc.2 = add nsw i32 %15, 1
>   store i32 %inc.2, i32* %i, align 4
>   %16 = load i32, i32* %i, align 4
>   %cmp.3 = icmp slt i32 %16, 16
>   br i1 %cmp.3, label %for.body.3, label %for.end
>
> for.body.3:                                       ; preds = %for.inc.2
>   %17 = load i32, i32* %c, align 4
>   %18 = load i32, i32* %d, align 4
>   %call1.3 = call i32 @add(i32 %17, i32 %18)
>   store i32 %call1.3, i32* %d, align 4
>   br label %for.inc.3
>
> for.inc.3:                                        ; preds = %for.body.3
>   %19 = load i32, i32* %i, align 4
>   %inc.3 = add nsw i32 %19, 1
>   store i32 %inc.3, i32* %i, align 4
>   br label %for.cond, !llvm.loop !2
> }
>
> declare dso_local i32 @printf(i8*, ...) #1
>
> attributes #0 = { noinline nounwind uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "frame-pointer"="all"
> "less-precise-fpmad"="false" "min-legal-vector-width"="0"
> "no-infs-fp-math"="false" "no-jump-tables"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "frame-pointer"="all"
> "less-precise-fpmad"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0}
> !llvm.ident = !{!1}
>
> !0 = !{i32 1, !"wchar_size", i32 4}
> !1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git
> a3485301d4870f57590d7b69eed7959134a694ab)"}
> !2 = distinct !{!2, !3}
> !3 = !{!"llvm.loop.unroll.disable"}
>
>
> So my problem is:
> With an unroll count of 4 on a loop with a trip count of 16, I would expect
> to see one single block with the increment i=i+4, four copies of each
> original instruction, and a condition that checks if i<16. This is the
> intuitive code. However, the increment that I get is i=i+1 and there are
> only 4 blocks.
>
>
> Do you know why this happens?
>
> I think loop-unroll works as expected in your example, as you can see the
> copies of the unrolled loop blocks (for.body.X, for.inc.X). The reason this
> is not simplified to the single block you are expecting is the input for
> -loop-unroll: -loop-unroll gets the IR without any optimizations (-O0).
>
> For the expected result, you need to run a few additional passes before
> -loop-unroll to promote some of the loads/stores to registers and simplify
> the CFG of the input.  Running `opt -mem2reg -simplifycfg -loop-unroll
> -unroll-count=4 forO0.ll -S` should give you something like
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body.3,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body.3 ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body.3 ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   br label %for.body.1
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
>
> for.body.1:                                       ; preds = %for.body
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   br label %for.body.2
>
> for.body.2:                                       ; preds = %for.body.1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   br label %for.body.3
>
> for.body.3:                                       ; preds = %for.body.2
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
> }
>
> Note that there are still 4 copies of the body instead of a single one.
> Like many passes in LLVM, the loop-unroll pass focuses on performing one
> transformation (duplicating the loop body a number of times) and relies on
> other passes to clean-up/simplify the result. To fold the 4 copies of the
> body into a single block, you need another round of CFG simplifications.
> Running `opt -mem2reg -simplifycfg -loop-unroll -unroll-count=4
> -simplifycfg forO0.ll -S` produces the code below, which is what you are
> looking for IIUC.
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
> }
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200527/5d48207e/attachment.html>