[llvm-dev] Loop Unroll

Sat May 23 09:15:14 PDT 2020

This is my example (for.c):

#include <stdio.h>

/* Returns the sum of a and b (plain signed int addition). */
int add(int a, int b) {
    return a + b;
}

/*
 * Test driver for the unrolling experiment: computes c = add(5, 15) once,
 * then accumulates c into d by calling add(c, d) in a fixed-count loop.
 */
int main() {
   int a, b, c, d;
   a = 5;
   b = 15;
   c = add(a, b);
   d = 0;
   /* Constant bounds: i runs from 0 to 15, so the body executes 16 times. */
   for(int i=0;i<16;i++)
       d = add(c, d);
   /* NOTE(review): the IR listing further down also contains a printf of d
      that does not appear in this listing -- confirm that this is the exact
      source that was compiled. */
}

I run:
$ clang -O0 -Xclang -disable-O0-optnone -emit-llvm for.c -S -o forO0.ll
$ opt -O0 -S --loop-unroll --unroll-count=4 -view-cfg forO0.ll -o for-opt00-unroll4.ll

And this is the LLVM IR code that I get:

; ModuleID = 'forO0.ll'
source_filename = "for.c"
target datalayout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline nounwind uwtable
define dso_local i32 @add(i32 %a, i32 %b) #0 {
entry:
  %a.addr = alloca i32, align 4
  %b.addr = alloca i32, align 4
  store i32 %a, i32* %a.addr, align 4
  store i32 %b, i32* %b.addr, align 4
  %0 = load i32, i32* %a.addr, align 4
  %1 = load i32, i32* %b.addr, align 4
  %add = add nsw i32 %0, %1
  ret i32 %add
}

; Function Attrs: noinline nounwind uwtable
define dso_local i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  %a = alloca i32, align 4
  %b = alloca i32, align 4
  %c = alloca i32, align 4
  %d = alloca i32, align 4
  %i = alloca i32, align 4
  store i32 0, i32* %retval, align 4
  store i32 5, i32* %a, align 4
  store i32 15, i32* %b, align 4
  %0 = load i32, i32* %a, align 4
  %1 = load i32, i32* %b, align 4
  %call = call i32 @add(i32 %0, i32 %1)
  store i32 %call, i32* %c, align 4
  store i32 0, i32* %d, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc.3, %entry
  %2 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %2, 16
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %3 = load i32, i32* %c, align 4
  %4 = load i32, i32* %d, align 4
  %call1 = call i32 @add(i32 %3, i32 %4)
  store i32 %call1, i32* %d, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %5 = load i32, i32* %i, align 4
  %inc = add nsw i32 %5, 1
  store i32 %inc, i32* %i, align 4
  %6 = load i32, i32* %i, align 4
  %cmp.1 = icmp slt i32 %6, 16
  br i1 %cmp.1, label %for.body.1, label %for.end

for.end:                                          ; preds = %for.inc.2, %for.inc.1, %for.inc, %for.cond
  %7 = load i32, i32* %d, align 4
  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x
i8], [20 x i8]* @.str, i64 0, i64 0), i32 %7)
  %8 = load i32, i32* %retval, align 4
  ret i32 %8

for.body.1:                                       ; preds = %for.inc
  %9 = load i32, i32* %c, align 4
  %10 = load i32, i32* %d, align 4
  %call1.1 = call i32 @add(i32 %9, i32 %10)
  store i32 %call1.1, i32* %d, align 4
  br label %for.inc.1

for.inc.1:                                        ; preds = %for.body.1
  %11 = load i32, i32* %i, align 4
  %inc.1 = add nsw i32 %11, 1
  store i32 %inc.1, i32* %i, align 4
  %12 = load i32, i32* %i, align 4
  %cmp.2 = icmp slt i32 %12, 16
  br i1 %cmp.2, label %for.body.2, label %for.end

for.body.2:                                       ; preds = %for.inc.1
  %13 = load i32, i32* %c, align 4
  %14 = load i32, i32* %d, align 4
  %call1.2 = call i32 @add(i32 %13, i32 %14)
  store i32 %call1.2, i32* %d, align 4
  br label %for.inc.2

for.inc.2:                                        ; preds = %for.body.2
  %15 = load i32, i32* %i, align 4
  %inc.2 = add nsw i32 %15, 1
  store i32 %inc.2, i32* %i, align 4
  %16 = load i32, i32* %i, align 4
  %cmp.3 = icmp slt i32 %16, 16
  br i1 %cmp.3, label %for.body.3, label %for.end

for.body.3:                                       ; preds = %for.inc.2
  %17 = load i32, i32* %c, align 4
  %18 = load i32, i32* %d, align 4
  %call1.3 = call i32 @add(i32 %17, i32 %18)
  store i32 %call1.3, i32* %d, align 4
  br label %for.inc.3

for.inc.3:                                        ; preds = %for.body.3
  %19 = load i32, i32* %i, align 4
  %inc.3 = add nsw i32 %19, 1
  store i32 %inc.3, i32* %i, align 4
  br label %for.cond, !llvm.loop !2
}

declare dso_local i32 @printf(i8*, ...) #1

attributes #0 = { noinline nounwind uwtable
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false" "frame-pointer"="all"
"less-precise-fpmad"="false" "min-legal-vector-width"="0"
"no-infs-fp-math"="false" "no-jump-tables"="false"
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
"no-trapping-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false" "frame-pointer"="all"
"less-precise-fpmad"="false" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false"
"no-trapping-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git a3485301d4870f57590d7b69eed7959134a694ab)"}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.unroll.disable"}

So my problem is:
With an unroll count of 4 on a loop with 16 iterations, I expected to see a
single block for the increment i = i + 4, four copies of each instruction of
the original loop body, and a condition that checks i < 16. That is the code I
would intuitively expect. However, the increment that I get is still i = i + 1,
and there are only 4 blocks.

Do you know why this happens?

Thanks.

El vie., 22 may. 2020 a las 19:49, Florian Hahn (<florian_hahn at apple.com>)
escribió:

>
>
> > On May 22, 2020, at 09:55, legend xx via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
> >
> > Hi,
> >
> > I'm interested in finding a pass for loop unrolling in the LLVM compiler. I
> > tried opt --loop-unroll --unroll-count=4, but it doesn't work well.
> >
> > Which pass can I use, and how?
> >
>
> -loop-unroll should be the right pass. There are multiple possible reasons
> why the loop is not unrolled and the pass has a bunch of options to
> enable/force unrolling for more cases (see
> https://github.com/llvm/llvm-project/blob/master/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L81).
>
>
> Passing `-debug` should give you a better idea why the loop is not
> unrolled. If you would share the IR, someone might be able to provide
> additional insight.
>
> > I would also like to know if there is any way to mark the loops that I
> > want to be unrolled.
>
> Yes it is possible to explicitly mark loops for unrolling using metadata
> in LLVM IR: https://llvm.org/docs/LangRef.html#llvm-loop-unroll. But the
> metadata might not help, if the loop contains code the unroller does not
> support.
>
> Cheers,
> Florian
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20200523/e1e2bd42/attachment-0001.html>