[llvm] 1cedc51 - [Coroutines] Don't merge readnone calls in presplit coroutines

Sun Oct 16 19:37:18 PDT 2022

Author: Chuanqi Xu
Date: 2022-10-17T10:22:43+08:00
New Revision: 1cedc51ff5a2a4f51a4c7e8bb04bd145ac05fa50

URL: https://github.com/llvm/llvm-project/commit/1cedc51ff5a2a4f51a4c7e8bb04bd145ac05fa50
DIFF: https://github.com/llvm/llvm-project/commit/1cedc51ff5a2a4f51a4c7e8bb04bd145ac05fa50.diff

LOG: [Coroutines] Don't  merge readnone calls in presplit coroutines

Another alternative to fix the thread identification problem in
coroutines.

We plan to fix this problem by unifying memory effecting attributes. See
https://discourse.llvm.org/t/rfc-unify-memory-effect-attributes/65579.
But it may be a long-term project. And it is a pity that the coroutines
can't resume in different threads for years. So this one is temporary
fix. It may cause unnecessary performance regression for coroutines. But
correctness are more important. And this one is planned to be reverted
after we are able to unify the memory effecting attributes actually.

Reviewed By: jdoerfert, rjmccall

Differential Revision: https://reviews.llvm.org/D135550

Added: 
    llvm/test/Transforms/Coroutines/coro-readnone-02.ll
    llvm/test/Transforms/Coroutines/coro-readnone.ll

Modified: 
    llvm/docs/ReleaseNotes.rst
    llvm/lib/Transforms/Scalar/EarlyCSE.cpp
    llvm/lib/Transforms/Scalar/GVN.cpp
    llvm/lib/Transforms/Scalar/NewGVN.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c4fdabc20fc7..6b4711ff061e 100644

--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -42,6 +42,11 @@ Non-comprehensive list of changes in this release
    functionality, or simply have a lot to talk about), see the `NOTE` below
    for adding a new subsection.
 
+*  The ``readnone`` calls which are crossing suspend points in coroutines will
+   not be merged. Since ``readnone`` calls may access thread id and thread id
+   is not a constant in coroutines. This decision may cause unnecessary
+   performance regressions and we plan to fix it in later versions.
+
 * ...
 
 Update on required toolchains to build LLVM

diff  --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index abc1ded4025b..429a98b4a6d2 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -132,7 +132,15 @@ struct SimpleValue {
         }
         }
       }
-      return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+      return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy() &&
+             // FIXME: Currently the calls which may access the thread id may
+             // be considered as not accessing the memory. But this is
+             // problematic for coroutines, since coroutines may resume in a
+             // 
diff erent thread. So we disable the optimization here for the
+             // correctness. However, it may block many other correct
+             // optimizations. Revert this one when we detect the memory
+             // accessing kind more precisely.
+             !CI->getFunction()->isPresplitCoroutine();
     }
     return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
            isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
@@ -463,7 +471,15 @@ struct CallValue {
       return false;
 
     CallInst *CI = dyn_cast<CallInst>(Inst);
-    if (!CI || !CI->onlyReadsMemory())
+    if (!CI || !CI->onlyReadsMemory() ||
+        // FIXME: Currently the calls which may access the thread id may
+        // be considered as not accessing the memory. But this is
+        // problematic for coroutines, since coroutines may resume in a
+        // 
diff erent thread. So we disable the optimization here for the
+        // correctness. However, it may block many other correct
+        // optimizations. Revert this one when we detect the memory
+        // accessing kind more precisely.
+        CI->getFunction()->isPresplitCoroutine())
       return false;
     return true;
   }

diff  --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index c62aaae8206e..a489a890f664 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -450,12 +450,28 @@ void GVNPass::ValueTable::add(Value *V, uint32_t num) {
 }
 
 uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
-  if (AA->doesNotAccessMemory(C)) {
+  if (AA->doesNotAccessMemory(C) &&
+      // FIXME: Currently the calls which may access the thread id may
+      // be considered as not accessing the memory. But this is
+      // problematic for coroutines, since coroutines may resume in a
+      // 
diff erent thread. So we disable the optimization here for the
+      // correctness. However, it may block many other correct
+      // optimizations. Revert this one when we detect the memory
+      // accessing kind more precisely.
+      !C->getFunction()->isPresplitCoroutine()) {
     Expression exp = createExpr(C);
     uint32_t e = assignExpNewValueNum(exp).first;
     valueNumbering[C] = e;
     return e;
-  } else if (MD && AA->onlyReadsMemory(C)) {
+  } else if (MD && AA->onlyReadsMemory(C) &&
+             // FIXME: Currently the calls which may access the thread id may
+             // be considered as not accessing the memory. But this is
+             // problematic for coroutines, since coroutines may resume in a
+             // 
diff erent thread. So we disable the optimization here for the
+             // correctness. However, it may block many other correct
+             // optimizations. Revert this one when we detect the memory
+             // accessing kind more precisely.
+             !C->getFunction()->isPresplitCoroutine()) {
     Expression exp = createExpr(C);
     auto ValNum = assignExpNewValueNum(exp);
     if (ValNum.second) {

diff  --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index d1397395ed24..afa346ea320e 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -1607,6 +1607,17 @@ NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
       return ExprResult::some(createVariableOrConstant(ReturnedValue));
     }
   }
+
+  // FIXME: Currently the calls which may access the thread id may
+  // be considered as not accessing the memory. But this is
+  // problematic for coroutines, since coroutines may resume in a
+  // 
diff erent thread. So we disable the optimization here for the
+  // correctness. However, it may block many other correct
+  // optimizations. Revert this one when we detect the memory
+  // accessing kind more precisely.
+  if (CI->getFunction()->isPresplitCoroutine())
+    return ExprResult::none();
+
   if (AA->doesNotAccessMemory(CI)) {
     return ExprResult::some(
         createCallExpression(CI, TOPClass->getMemoryLeader()));

diff  --git a/llvm/test/Transforms/Coroutines/coro-readnone-02.ll b/llvm/test/Transforms/Coroutines/coro-readnone-02.ll
new file mode 100644
index 000000000000..eede209fbdd0
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-readnone-02.ll
@@ -0,0 +1,81 @@
+; Tests that the readnone function which don't cross suspend points could be optimized expectly after split.
+;
+; RUN: opt < %s -S -passes='default<O3>' | FileCheck %s --check-prefixes=CHECK_SPLITTED
+; RUN: opt < %s -S -passes='coro-split,early-cse,simplifycfg' | FileCheck %s --check-prefixes=CHECK_SPLITTED
+; RUN: opt < %s -S -passes='coro-split,gvn,simplifycfg' | FileCheck %s --check-prefixes=CHECK_SPLITTED
+; RUN: opt < %s -S -passes='coro-split,newgvn,simplifycfg' | FileCheck %s --check-prefixes=CHECK_SPLITTED
+; RUN: opt < %s -S -passes='early-cse' | FileCheck %s --check-prefixes=CHECK_UNSPLITTED
+; RUN: opt < %s -S -passes='gvn' | FileCheck %s --check-prefixes=CHECK_UNSPLITTED
+; RUN: opt < %s -S -passes='newgvn' | FileCheck %s --check-prefixes=CHECK_UNSPLITTED
+
+define ptr @f() presplitcoroutine {
+entry:
+  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call ptr @malloc(i32 %size)
+  %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc)
+  %sus_result = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %sus_result, label %suspend [i8 0, label %resume
+                                         i8 1, label %cleanup]
+resume:
+  %i = call i32 @readnone_func() readnone
+  ; noop call to break optimization to combine two consecutive readonly calls.
+  call void @nop()
+  %j = call i32 @readnone_func() readnone
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %same, label %
diff 
+
+same:
+  call void @print_same()
+  br label %cleanup
+
+
diff :
+  call void @print_
diff ()
+  br label %cleanup
+
+cleanup:
+  %mem = call ptr @llvm.coro.free(token %id, ptr %hdl)
+  call void @free(ptr %mem)
+  br label %suspend
+
+suspend:
+  call i1 @llvm.coro.end(ptr %hdl, i1 0)
+  ret ptr %hdl
+}
+
+;
+; CHECK_SPLITTED-LABEL: f.resume(
+; CHECK_SPLITTED-NEXT:  :
+; CHECK_SPLITTED-NEXT:    call i32 @readnone_func() #[[ATTR_NUM:[0-9]+]]
+; CHECK_SPLITTED-NEXT:    call void @nop()
+; CHECK_SPLITTED-NEXT:    call void @print_same()
+;
+; CHECK_SPLITTED: attributes #[[ATTR_NUM]] = { readnone }
+;
+; CHECK_UNSPLITTED-LABEL: @f(
+; CHECK_UNSPLITTED: br i1 %cmp, label %same, label %
diff 
+; CHECK_UNSPLITTED-EMPTY:
+; CHECK_UNSPLITTED-NEXT: same:
+; CHECK_UNSPLITTED-NEXT:   call void @print_same()
+; CHECK_UNSPLITTED-NEXT:   br label
+; CHECK_UNSPLITTED-EMPTY:
+; CHECK_UNSPLITTED-NEXT: 
diff :
+; CHECK_UNSPLITTED-NEXT:   call void @print_
diff ()
+; CHECK_UNSPLITTED-NEXT:   br label
+
+declare i32 @readnone_func() readnone
+declare void @nop()
+
+declare void @print_same()
+declare void @print_
diff ()
+declare ptr @llvm.coro.free(token, ptr)
+declare i32 @llvm.coro.size.i32()
+declare i8  @llvm.coro.suspend(token, i1)
+
+declare token @llvm.coro.id(i32, ptr, ptr, ptr)
+declare i1 @llvm.coro.alloc(token)
+declare ptr @llvm.coro.begin(token, ptr)
+declare i1 @llvm.coro.end(ptr, i1)
+
+declare noalias ptr @malloc(i32)
+declare void @free(ptr)

diff  --git a/llvm/test/Transforms/Coroutines/coro-readnone.ll b/llvm/test/Transforms/Coroutines/coro-readnone.ll
new file mode 100644
index 000000000000..b7bcff45c410
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-readnone.ll
@@ -0,0 +1,89 @@
+; Tests that the readnone function which cross suspend points wouldn't be misoptimized.
+; RUN: opt < %s -S -passes='default<O3>' | FileCheck %s --check-prefixes=CHECK,CHECK_SPLITTED
+; RUN: opt < %s -S -passes='early-cse' | FileCheck %s --check-prefixes=CHECK,CHECK_UNSPLITTED
+; RUN: opt < %s -S -passes='gvn' | FileCheck %s --check-prefixes=CHECK,CHECK_UNSPLITTED
+; RUN: opt < %s -S -passes='newgvn' | FileCheck %s --check-prefixes=CHECK,CHECK_UNSPLITTED
+
+define ptr @f() presplitcoroutine {
+entry:
+  %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call ptr @malloc(i32 %size)
+  %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc)
+  %j = call i32 @readnone_func() readnone
+  %sus_result = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %sus_result, label %suspend [i8 0, label %resume
+                                         i8 1, label %cleanup]
+resume:
+  %i = call i32 @readnone_func() readnone
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %same, label %
diff 
+
+same:
+  call void @print_same()
+  br label %cleanup
+
+
diff :
+  call void @print_
diff ()
+  br label %cleanup
+
+cleanup:
+  %mem = call ptr @llvm.coro.free(token %id, ptr %hdl)
+  call void @free(ptr %mem)
+  br label %suspend
+
+suspend:
+  call i1 @llvm.coro.end(ptr %hdl, i1 0)
+  ret ptr %hdl
+}
+
+; Tests that normal functions wouldn't be affected.
+define i1 @normal_function() {
+entry:
+  %i = call i32 @readnone_func() readnone
+  %j = call i32 @readnone_func() readnone
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %same, label %
diff 
+
+same:
+  call void @print_same()
+  ret i1 true
+
+
diff :
+  call void @print_
diff ()
+  ret i1 false
+}
+
+; CHECK_SPLITTED-LABEL: normal_function(
+; CHECK_SPLITTED-NEXT: entry
+; CHECK_SPLITTED-NEXT:   call i32 @readnone_func()
+; CHECK_SPLITTED-NEXT:   call void @print_same()
+; CHECK_SPLITTED-NEXT:   ret i1 true
+;
+; CHECK_SPLITTED-LABEL: f.resume(
+; CHECK_UNSPLITTED-LABEL: @f(
+; CHECK: br i1 %cmp, label %same, label %
diff 
+; CHECK-EMPTY:
+; CHECK-NEXT: same:
+; CHECK-NEXT:   call void @print_same()
+; CHECK-NEXT:   br label
+; CHECK-EMPTY:
+; CHECK-NEXT: 
diff :
+; CHECK-NEXT:   call void @print_
diff ()
+; CHECK-NEXT:   br label
+
+declare i32 @readnone_func() readnone
+
+declare void @print_same()
+declare void @print_
diff ()
+declare ptr @llvm.coro.free(token, ptr)
+declare i32 @llvm.coro.size.i32()
+declare i8  @llvm.coro.suspend(token, i1)
+
+declare token @llvm.coro.id(i32, ptr, ptr, ptr)
+declare i1 @llvm.coro.alloc(token)
+declare ptr @llvm.coro.begin(token, ptr)
+declare i1 @llvm.coro.end(ptr, i1)
+
+declare noalias ptr @malloc(i32)
+declare void @free(ptr)