[llvm-dev] Strange coroutine optimizations
Ariya Shajii via llvm-dev
llvm-dev at lists.llvm.org
Sat Oct 6 11:21:54 PDT 2018
Hi,
I've been using LLVM coroutines with great success over the last couple of
months. Everything works nicely, but I've run into a strange optimization
quirk. Take a look at the following two cases, which I'd expect to produce
identical optimized code:
### Case 1: Constant argument
Code (simple integer iterator up to a specified value):
```
def range(n: Int) -> Int {
    var i = 0
    while i < n {
        yield i
        i = i + 1
    }
}

for i in range(2) {
    print i
}
```
Generated IR (similar structure to the examples in the coroutine docs):
```
define private void @main([...]) {
preamble:
[...]
%2 = alloca i64, i64 1
br label %entry
entry: ; preds = %preamble
%3 = call i8* @range(i64 2)
br label %for
for_cont: ; No predecessors!
br label %for
for:                                              ; preds = %body, %for_cont, %entry
call void @llvm.coro.resume(i8* %3)
%4 = call i1 @llvm.coro.done(i8* %3)
br i1 %4, label %cleanup, label %body
body: ; preds = %for
%5 = call i8* @llvm.coro.promise(i8* %3, i32 8, i1 false)
%6 = bitcast i8* %5 to i64*
%7 = load i64, i64* %6
store i64 %7, i64* %2
%8 = load i64, i64* %2
call void @print_int(i64 %8)
br label %for
cleanup: ; preds = %for
call void @llvm.coro.destroy(i8* %3)
br label %exit
exit: ; preds = %cleanup
ret void
}
define private i8* @range(i64) {
preamble:
%promise = alloca i64, i64 1
%1 = bitcast i64* %promise to i8*
%id = call token @llvm.coro.id(i32 0, i8* %1, i8* null, i8* null)
%2 = alloca i64, i64 1
store i64 %0, i64* %2
%3 = alloca i64, i64 1
%4 = call i1 @llvm.coro.alloc(token %id)
br i1 %4, label %alloc, label %entry
alloc: ; preds = %preamble
%5 = call i64 @llvm.coro.size.i64()
%6 = call i8* @alloc(i64 %5)
br label %entry
entry:                                            ; preds = %preamble, %alloc
%7 = phi i8* [ null, %preamble ], [ %6, %alloc ]
%hdl = call i8* @llvm.coro.begin(token %id, i8* %7)
%8 = call i8 @llvm.coro.suspend(token none, i1 false)
switch i8 %8, label %suspend [
i8 0, label %10
i8 1, label %cleanup
]
final: ; preds = %exit
%9 = call i8 @llvm.coro.suspend(token none, i1 true)
switch i8 %9, label %suspend [
i8 0, label %21
i8 1, label %cleanup
]
; <label>:10: ; preds = %entry
store i64 0, i64* %3
br label %while
while: ; preds = %18, %10
%11 = load i64, i64* %3
%12 = load i64, i64* %2
%13 = icmp slt i64 %11, %12
%14 = zext i1 %13 to i8
%15 = trunc i8 %14 to i1
br i1 %15, label %body, label %exit
body: ; preds = %while
%16 = load i64, i64* %3
store i64 %16, i64* %promise
%17 = call i8 @llvm.coro.suspend(token none, i1 false)
switch i8 %17, label %suspend [
i8 0, label %18
i8 1, label %cleanup
]
; <label>:18: ; preds = %body
%19 = load i64, i64* %3
%20 = add i64 %19, 1
store i64 %20, i64* %3
br label %while
exit: ; preds = %while
br label %final
; <label>:21: ; preds = %final
unreachable
cleanup:                                          ; preds = %final, %body, %entry
%22 = call i8* @llvm.coro.free(token %id, i8* %hdl)
br label %suspend
suspend:                                          ; preds = %final, %body, %entry, %cleanup
%23 = call i1 @llvm.coro.end(i8* %hdl, i1 false)
ret i8* %hdl
}
```
Optimized IR:
```
define i32 @main([...]) local_unnamed_addr {
entry:
[...]
tail call void @print_int(i64 0)
tail call void @print_int(i64 1)
ret i32 0
}
```
Everything works as expected in this case.
### Case 2: Inline constant
Almost identical code, but with the argument removed and the `2` pasted
directly into the function:
```
def range() -> Int {
    var i = 0
    while i < 2 {
        yield i
        i = i + 1
    }
}

for i in range() {
    print i
}
```
The unoptimized LLVM code is almost identical: `range` is now called with no
arguments, the corresponding `alloca`+`store` in the first block of `range` is
gone, and the second operand of the `icmp` is the constant `2`.
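Concretely, the `while` block of `range` in this case looks roughly like this
(just a sketch with placeholder register names; the real numbering shifts
because the argument `alloca` is gone):
```
while:
  %cur = load i64, i64* %3
  %cmp = icmp slt i64 %cur, 2          ; constant 2 instead of a load of the argument
  %ext = zext i1 %cmp to i8
  %cond = trunc i8 %ext to i1
  br i1 %cond, label %body, label %exit
```
Everything else in the unoptimized `range` matches Case 1.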
But strangely the optimized code is very different:
```
define i32 @main([...]) local_unnamed_addr {
entry:
[...]
br label %body.i
body.i:                                           ; preds = %body.i.backedge, %exit
%.sroa.6.1.i = phi i64 [ 0, %exit ], [ %.sroa.6.1.i.be, %body.i.backedge ]
%.sroa.11.1.i = phi i2 [ 1, %exit ], [ %.sroa.11.1.i.be, %body.i.backedge ]
tail call void @print_int(i64 %.sroa.6.1.i)
switch i2 %.sroa.11.1.i, label %unreachable.i8.i [
i2 0, label %body.i.backedge
i2 1, label %AfterCoroSuspend11.i5.i
i2 -2, label %main.exit
]
AfterCoroSuspend11.i5.i: ; preds = %body.i
br label %body.i.backedge
body.i.backedge:                                  ; preds = %AfterCoroSuspend11.i5.i, %body.i
%.sroa.6.1.i.be = phi i64 [ 1, %AfterCoroSuspend11.i5.i ], [ 0, %body.i ]
%.sroa.11.1.i.be = phi i2 [ -2, %AfterCoroSuspend11.i5.i ], [ 1, %body.i ]
br label %body.i
unreachable.i8.i: ; preds = %body.i
unreachable
main.exit: ; preds = %body.i
ret i32 0
}
```
What's going on here? Am I doing something wrong when optimizing through
the C++ API? This is my optimization code:
```
codegen(module);
// verifyModule etc.
std::unique_ptr<legacy::PassManager> pm(new legacy::PassManager());
std::unique_ptr<legacy::FunctionPassManager> fpm(new legacy::FunctionPassManager(module));

unsigned optLevel = 3;
unsigned sizeLevel = 0;
PassManagerBuilder builder;

if (!debug) {
  builder.OptLevel = optLevel;
  builder.SizeLevel = sizeLevel;
  builder.Inliner = createFunctionInliningPass(optLevel, sizeLevel, false);
  builder.DisableUnitAtATime = false;
  builder.DisableUnrollLoops = false;
  builder.LoopVectorize = true;
  builder.SLPVectorize = true;
}
builder.MergeFunctions = true;

// Registers CoroEarly/CoroSplit/CoroElide/CoroCleanup at the builder's extension points.
addCoroutinePassesToExtensionPoints(builder);
builder.populateModulePassManager(*pm);
builder.populateFunctionPassManager(*fpm);

// Run the per-function passes first, then the module-level pipeline.
fpm->doInitialization();
for (Function& f : *module)
  fpm->run(f);
fpm->doFinalization();
pm->run(*module);
```
Any help would be much appreciated.
Many thanks,
Ariya