[llvm-dev] Loop Unroll

Tue May 26 10:55:41 PDT 2020

Awesome, thanks!

Now I have another question. I have a matrix multiplication code. This is
my code:

#include <stdio.h>
#include <stdlib.h>

#define n 4

/*
 * Test program: multiplies two n x n integer matrices (n is the macro
 * defined above) using the classic i-j-k triple loop nest.
 *
 * argc and argv are not used. The product matrix C is computed but never
 * read or printed afterwards; the function always returns 0.
 */
int main(int argc, char *argv[]) {
    int i, j, k;

    int A[n][n], B[n][n], C[n][n];
    /* Initialization: every element of A is 1, of B is 2, and of C is 0. */
    for(i=0;i<n;i++){
        for(j=0;j<n;j++){
           A[i][j] = 1;
           B[i][j] = 2;
           C[i][j] = 0;
        }
    }

    /*
     * C = A * B. The innermost k loop accumulates into C[i][j] the dot
     * product of row i of A and column j of B.
     */
    for(i=0;i<n;i++){
        for(j=0;j<n;j++){
            for(k=0;k<n;k++){
                C[i][j]=(C[i][j]+(A[i][k]*B[k][j]));
            }
        }
    }

    return 0;
}

I tried running the loop-unroll-and-jam pass on it. I ran:

$ clang -O0 -Xclang -disable-O0-optnone -emit-llvm mult.c -S -o mult.ll
$ opt -O0 -S -mem2reg -simplifycfg -view-cfg mult.ll -o multopt.ll
$ opt -O0 -S -mem2reg -simplifycfg --loop-unroll-and-jam
--unroll-and-jam-count=4 -simplifycfg -view-cfg mult.ll -o
mult-opt00-unroll4.ll

I get the same CFG graph in both cases (I attach it). Also, I tried with the -O1
level (opt -O1 -loop-unroll-and-jam --unroll-and-jam-count=4), but I didn't
see any difference.

Why does the --loop-unroll-and-jam pass not work?

El dom., 24 may. 2020 a las 14:36, Florian Hahn (<florian_hahn at apple.com>)
escribió:

>
>
> On May 23, 2020, at 17:15, legend xx <legendaryxx7slh at gmail.com> wrote:
>
> This is my example (for.c):
>
> #include <stdio.h>
>
> int add(int a, int b) {
>     return a + b;
> }
>
> int main() {
>    int a, b, c, d;
>    a = 5;
>    b = 15;
>    c = add(a, b);
>    d = 0;
>    for(int i=0;i<16;i++)
>        d = add(c, d);
> }
>
> I run:
> $ clang -O0 -Xclang -disable-O0-optnone -emit-llvm for.c -S -o forO0.ll
> $ opt -O0 -S --loop-unroll --unroll-count=4 -view-cfg forO0.ll -o
> for-opt00-unroll4.ll
>
> And this is the LLVM IR code that I get:
>
> ; ModuleID = 'forO0.ll'
> source_filename = "for.c"
> target datalayout =
> "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @add(i32 %a, i32 %b) #0 {
> entry:
>   %a.addr = alloca i32, align 4
>   %b.addr = alloca i32, align 4
>   store i32 %a, i32* %a.addr, align 4
>   store i32 %b, i32* %b.addr, align 4
>   %0 = load i32, i32* %a.addr, align 4
>   %1 = load i32, i32* %b.addr, align 4
>   %add = add nsw i32 %0, %1
>   ret i32 %add
> }
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @main() #0 {
> entry:
>   %retval = alloca i32, align 4
>   %a = alloca i32, align 4
>   %b = alloca i32, align 4
>   %c = alloca i32, align 4
>   %d = alloca i32, align 4
>   %i = alloca i32, align 4
>   store i32 0, i32* %retval, align 4
>   store i32 5, i32* %a, align 4
>   store i32 15, i32* %b, align 4
>   %0 = load i32, i32* %a, align 4
>   %1 = load i32, i32* %b, align 4
>   %call = call i32 @add(i32 %0, i32 %1)
>   store i32 %call, i32* %c, align 4
>   store i32 0, i32* %d, align 4
>   store i32 0, i32* %i, align 4
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.inc.3,
> %entry
>   %2 = load i32, i32* %i, align 4
>   %cmp = icmp slt i32 %2, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %3 = load i32, i32* %c, align 4
>   %4 = load i32, i32* %d, align 4
>   %call1 = call i32 @add(i32 %3, i32 %4)
>   store i32 %call1, i32* %d, align 4
>   br label %for.inc
>
> for.inc:                                          ; preds = %for.body
>   %5 = load i32, i32* %i, align 4
>   %inc = add nsw i32 %5, 1
>   store i32 %inc, i32* %i, align 4
>   %6 = load i32, i32* %i, align 4
>   %cmp.1 = icmp slt i32 %6, 16
>   br i1 %cmp.1, label %for.body.1, label %for.end
>
> for.end:                                          ; preds = %for.inc.2,
> %for.inc.1, %for.inc, %for.cond
>   %7 = load i32, i32* %d, align 4
>   %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x
> i8], [20 x i8]* @.str, i64 0, i64 0), i32 %7)
>   %8 = load i32, i32* %retval, align 4
>   ret i32 %8
>
> for.body.1:                                       ; preds = %for.inc
>   %9 = load i32, i32* %c, align 4
>   %10 = load i32, i32* %d, align 4
>   %call1.1 = call i32 @add(i32 %9, i32 %10)
>   store i32 %call1.1, i32* %d, align 4
>   br label %for.inc.1
>
> for.inc.1:                                        ; preds = %for.body.1
>   %11 = load i32, i32* %i, align 4
>   %inc.1 = add nsw i32 %11, 1
>   store i32 %inc.1, i32* %i, align 4
>   %12 = load i32, i32* %i, align 4
>   %cmp.2 = icmp slt i32 %12, 16
>   br i1 %cmp.2, label %for.body.2, label %for.end
>
> for.body.2:                                       ; preds = %for.inc.1
>   %13 = load i32, i32* %c, align 4
>   %14 = load i32, i32* %d, align 4
>   %call1.2 = call i32 @add(i32 %13, i32 %14)
>   store i32 %call1.2, i32* %d, align 4
>   br label %for.inc.2
>
> for.inc.2:                                        ; preds = %for.body.2
>   %15 = load i32, i32* %i, align 4
>   %inc.2 = add nsw i32 %15, 1
>   store i32 %inc.2, i32* %i, align 4
>   %16 = load i32, i32* %i, align 4
>   %cmp.3 = icmp slt i32 %16, 16
>   br i1 %cmp.3, label %for.body.3, label %for.end
>
> for.body.3:                                       ; preds = %for.inc.2
>   %17 = load i32, i32* %c, align 4
>   %18 = load i32, i32* %d, align 4
>   %call1.3 = call i32 @add(i32 %17, i32 %18)
>   store i32 %call1.3, i32* %d, align 4
>   br label %for.inc.3
>
> for.inc.3:                                        ; preds = %for.body.3
>   %19 = load i32, i32* %i, align 4
>   %inc.3 = add nsw i32 %19, 1
>   store i32 %inc.3, i32* %i, align 4
>   br label %for.cond, !llvm.loop !2
> }
>
> declare dso_local i32 @printf(i8*, ...) #1
>
> attributes #0 = { noinline nounwind uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "frame-pointer"="all"
> "less-precise-fpmad"="false" "min-legal-vector-width"="0"
> "no-infs-fp-math"="false" "no-jump-tables"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
> attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false" "frame-pointer"="all"
> "less-precise-fpmad"="false" "no-infs-fp-math"="false"
> "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false" "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false" "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0}
> !llvm.ident = !{!1}
>
> !0 = !{i32 1, !"wchar_size", i32 4}
> !1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git
> a3485301d4870f57590d7b69eed7959134a694ab)"}
> !2 = distinct !{!2, !3}
> !3 = !{!"llvm.loop.unroll.disable"}
>
>
> So my problem is:
> With an unroll count of 4 on a loop with 16 iterations, I would expect a
> single block in which i is incremented by 4 (i=i+4), containing 4 copies of
> each original instruction, and a condition that checks whether i<16. That is
> the code I would intuitively expect. However, the increment that I get is
> i=i+1 and there are only 4 blocks.
>
> Do you know why this happens?
>
>
> I think loop-unroll works as expected in your example, as you can see the
> copies of the unrolled loop blocks (for.body.X, for.inc.X). The reason this
> is not simplified to the single block you are expecting is the input for
> -loop-unroll: -loop-unroll gets the IR without any optimizations (-O0).
>
> For the expected result, you need to run a few additional passes before
> -loop-unroll to promote some of the loads/stores to registers and simplify
> the CFG of the input.  Running `opt -mem2reg -simplifycfg -loop-unroll
> -unroll-count=4 forO0.ll -S` should give you something like
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body.3,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body.3 ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body.3 ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   br label %for.body.1
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
>
> for.body.1:                                       ; preds = %for.body
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   br label %for.body.2
>
> for.body.2:                                       ; preds = %for.body.1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   br label %for.body.3
>
> for.body.3:                                       ; preds = %for.body.2
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
> }
>
> Note that there are still 4 copies of the body instead of a single one.
> Like many passes in LLVM, the loop-unroll pass focuses on performing one
> transformation (duplicating the loop body a number of times) and relies on
> other passes to clean-up/simplify the result. To fold the 4 copies of the
> body into a single block, you need another round of CFG simplifications.
> Running `opt -mem2reg -simplifycfg -loop-unroll -unroll-count=4
> -simplifycfg forO0.ll -S` produces the code below, which is what you are
> looking for IIUC.
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
> }
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200526/e9f9be51/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cfgmain-cc1667.dot
Type: application/msword-template
Size: 5466 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200526/e9f9be51/attachment.bin>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cfgmain-cce7ad.dot
Type: application/msword-template
Size: 5466 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200526/e9f9be51/attachment-0001.bin>