[llvm] c8755b6 - [Coroutines] Optimize the lifespan of temporary co_await object

Xun Li via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 28 10:18:25 PDT 2020


Author: Xun Li
Date: 2020-06-28T10:18:15-07:00
New Revision: c8755b6378c2a1f32d9a90bad6c56a1cc5a830c3

URL: https://github.com/llvm/llvm-project/commit/c8755b6378c2a1f32d9a90bad6c56a1cc5a830c3
DIFF: https://github.com/llvm/llvm-project/commit/c8755b6378c2a1f32d9a90bad6c56a1cc5a830c3.diff

LOG: [Coroutines] Optimize the lifespan of temporary co_await object

Summary:
If we ever assign co_await to a temporary variable, such as foo(co_await expr),
we generate AST that looks like this: MaterializedTemporaryExpr(CoawaitExpr(...)).
MaterializedTemporaryExpr would emit an intrinsic that marks the lifetime start of the
temporary storage. However such temporary storage will not be used until co_await is ready
to write the result. Marking the lifetime start way too early causes extra storage to be
put in the coroutine frame instead of the stack.
As you can see from https://godbolt.org/z/zVx_eB, the frame generated for get_big_object2 is 12K, which contains a big_object object unnecessarily.
After this patch, the frame size for get_big_object2 is now only 8K. There is still room for improvement; in particular, GCC has a 4K frame for this function. But that's a separate problem and not addressed in this patch.

The basic idea of this patch is, during CoroSplit, to look for every local variable in the coroutine created through AllocaInst, identify all the lifetime start/end markers and the uses of the variable, and sink the lifetime.start marker to a place as close to the first-ever use as possible.

Reviewers: lewissbaker, modocache, junparser

Reviewed By: junparser

Subscribers: hiraditya, llvm-commits, rsmith, ChuanqiXu, cfe-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D82314

Added: 
    llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll

Modified: 
    llvm/lib/Transforms/Coroutines/CoroSplit.cpp
    llvm/test/Transforms/Coroutines/coro-split-02.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 4fd127656f21..0841cebab51c 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -75,7 +75,7 @@ using namespace llvm;
 
 namespace {
 
-/// A little helper class for building 
+/// A little helper class for building
 class CoroCloner {
 public:
   enum class Kind {
@@ -563,7 +563,7 @@ void CoroCloner::replaceEntryBlock() {
   // In the original function, the AllocaSpillBlock is a block immediately
   // following the allocation of the frame object which defines GEPs for
   // all the allocas that have been moved into the frame, and it ends by
-  // branching to the original beginning of the coroutine.  Make this 
+  // branching to the original beginning of the coroutine.  Make this
   // the entry block of the cloned function.
   auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]);
   auto *OldEntry = &NewF->getEntryBlock();
@@ -1239,6 +1239,103 @@ static void simplifySuspendPoints(coro::Shape &Shape) {
   S.resize(N);
 }
 
+/// For every local variable (alloca) that has lifetime intrinsic markers, we
+/// sink its lifetime.start markers to the places where the variable is being
+/// used for the first time. Doing so minimizes the lifetime of each variable,
+/// hence minimizing the amount of data we end up putting on the frame.
+static void sinkLifetimeStartMarkers(Function &F) {
+  DominatorTree Dom(F);
+  for (Instruction &I : instructions(F)) {
+    // We look for this particular pattern:
+    //   %tmpX = alloca %.., align ...
+    //   %0 = bitcast %...* %tmpX to i8*
+    //   call void @llvm.lifetime.start.p0i8(i64 ..., i8* nonnull %0) #2
+    if (!isa<AllocaInst>(&I))
+      continue;
+    // There can be multiple lifetime start markers for the same variable.
+    SmallPtrSet<IntrinsicInst *, 1> LifetimeStartInsts;
+    // SinkBarriers stores the instructions that bound the sinking: the
+    // lifetime.end markers and all direct (non-bitcast) users of this local
+    // variable. The lifetime.start markers are never sunk past these barriers.
+    SmallPtrSet<Instruction *, 4> SinkBarriers;
+    bool Valid = true;
+    auto AddSinkBarrier = [&](Instruction *I) {
+      // When adding a new barrier to SinkBarriers, we maintain the invariant
+      // that no instruction in SinkBarriers dominates another instruction.
+      SmallPtrSet<Instruction *, 1> ToRemove;
+      bool ShouldAdd = true;
+      for (Instruction *S : SinkBarriers) {
+        if (I == S || Dom.dominates(S, I)) {
+          ShouldAdd = false;
+          break;
+        } else if (Dom.dominates(I, S)) {
+          ToRemove.insert(S);
+        }
+      }
+      if (ShouldAdd) {
+        SinkBarriers.insert(I);
+        for (Instruction *R : ToRemove) {
+          SinkBarriers.erase(R);
+        }
+      }
+    };
+    for (User *U : I.users()) {
+      if (!isa<BitCastInst>(U))
+        continue;
+      for (User *CU : U->users()) {
+        // If we see any user of the BitCastInst that is not a lifetime
+        // start/end intrinsic, give up because it's too complex.
+        if (auto *CUI = dyn_cast<IntrinsicInst>(CU)) {
+          if (CUI->getIntrinsicID() == Intrinsic::lifetime_start)
+            LifetimeStartInsts.insert(CUI);
+          else if (CUI->getIntrinsicID() == Intrinsic::lifetime_end)
+            AddSinkBarrier(CUI);
+          else
+            Valid = false;
+        } else {
+          Valid = false;
+        }
+      }
+    }
+    if (!Valid || LifetimeStartInsts.empty())
+      continue;
+
+    for (User *U : I.users()) {
+      if (isa<BitCastInst>(U))
+        continue;
+      // Every non-bitcast user of the variable is also a sink barrier.
+      AddSinkBarrier(cast<Instruction>(U));
+    }
+
+    // For each sink barrier, we insert a lifetime start marker right
+    // before it.
+    for (Instruction *S : SinkBarriers) {
+      if (auto *IS = dyn_cast<IntrinsicInst>(S)) {
+        if (IS->getIntrinsicID() == Intrinsic::lifetime_end) {
+          // If we have a lifetime end marker in SinkBarriers, meaning it's
+          // not dominated by any other barrier, we can safely delete it.
+          IS->eraseFromParent();
+          continue;
+        }
+      }
+      // We find an existing lifetime.start marker that dominates the barrier,
+      // clone it and insert it right before the barrier. We cannot clone an
+      // arbitrary lifetime.start marker because we want to make sure the
+      // BitCast instruction referenced by the marker also dominates the barrier.
+      for (const IntrinsicInst *LifetimeStart : LifetimeStartInsts) {
+        if (Dom.dominates(LifetimeStart, S)) {
+          LifetimeStart->clone()->insertBefore(S);
+          break;
+        }
+      }
+    }
+    // All the old lifetime.start markers are no longer necessary.
+    for (IntrinsicInst *S : LifetimeStartInsts) {
+      S->eraseFromParent();
+    }
+  }
+}
+
 static void splitSwitchCoroutine(Function &F, coro::Shape &Shape,
                                  SmallVectorImpl<Function *> &Clones) {
   assert(Shape.ABI == coro::ABI::Switch);
@@ -1428,6 +1525,7 @@ static coro::Shape splitCoroutine(Function &F,
     return Shape;
 
   simplifySuspendPoints(Shape);
+  sinkLifetimeStartMarkers(F);
   buildCoroutineFrame(F, Shape);
   replaceFrameSize(Shape);
 

diff  --git a/llvm/test/Transforms/Coroutines/coro-split-02.ll b/llvm/test/Transforms/Coroutines/coro-split-02.ll
index 993374291f41..6ab0015d1fb1 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-02.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-02.ll
@@ -31,6 +31,8 @@ await.ready:
   %val = load i32, i32* %Result.i19
   %cast = bitcast i32* %testval to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -47,6 +49,8 @@ exit:
 ; CHECK-NEXT:    %val = load i32, i32* %Result
 ; CHECK-NEXT:    %cast = bitcast i32* %testval to i8*
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:    %test = load i32, i32* %testval
+; CHECK-NEXT:    call void @print(i32 %test)
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
 ; CHECK-NEXT:    call void @print(i32 %val)
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
new file mode 100644
index 000000000000..2d6b28a2baf8
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
@@ -0,0 +1,71 @@
+; Tests that coro-split will optimize the lifetime.start marker of each local variable,
+; sinking it to the place closest to the actual use.
+; RUN: opt < %s -coro-split -S | FileCheck %s
+; RUN: opt < %s -passes=coro-split -S | FileCheck %s
+
+%"struct.std::coroutine_handle" = type { i8* }
+%"struct.std::coroutine_handle.0" = type { %"struct.std::coroutine_handle" }
+%"struct.lean_future<int>::Awaiter" = type { i32, %"struct.std::coroutine_handle.0" }
+
+declare i8* @malloc(i64)
+declare void @print(i32)
+
+define void @a() "coroutine.presplit"="1" {
+entry:
+  %ref.tmp7 = alloca %"struct.lean_future<int>::Awaiter", align 8
+  %testval = alloca i32
+  %cast = bitcast i32* %testval to i8*
+  ; lifetime of %testval starts here, but not used until await.ready.
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %alloc = call i8* @malloc(i64 16) #3
+  %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %save = call token @llvm.coro.save(i8* null)
+  %Result.i19 = getelementptr inbounds %"struct.lean_future<int>::Awaiter", %"struct.lean_future<int>::Awaiter"* %ref.tmp7, i64 0, i32 0
+  %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+  switch i8 %suspend, label %exit [
+    i8 0, label %await.ready
+    i8 1, label %exit
+  ]
+await.ready:
+  %StrayCoroSave = call token @llvm.coro.save(i8* null)
+  %val = load i32, i32* %Result.i19
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
+  call void @print(i32 %val)
+  br label %exit
+exit:
+  call i1 @llvm.coro.end(i8* null, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @a.resume(
+; CHECK:         %testval = alloca i32, align 4
+; CHECK-NEXT:    getelementptr inbounds %a.Frame
+; CHECK-NEXT:    getelementptr inbounds %"struct.lean_future<int>::Awaiter"
+; CHECK-NEXT:    %cast1 = bitcast i32* %testval to i8*
+; CHECK-NEXT:    %val = load i32, i32* %Result
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast1)
+; CHECK-NEXT:    %test = load i32, i32* %testval
+; CHECK-NEXT:    call void @print(i32 %test)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast1)
+; CHECK-NEXT:    call void @print(i32 %val)
+; CHECK-NEXT:    ret void
+
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*)
+declare i1 @llvm.coro.alloc(token) #3
+declare noalias nonnull i8* @"\01??2@YAPEAX_K@Z"(i64) local_unnamed_addr
+declare i64 @llvm.coro.size.i64() #5
+declare i8* @llvm.coro.begin(token, i8* writeonly) #3
+declare void @"\01?puts@@YAXZZ"(...)
+declare token @llvm.coro.save(i8*) #3
+declare i8* @llvm.coro.frame() #5
+declare i8 @llvm.coro.suspend(token, i1) #3
+declare void @"\01??3@YAXPEAX@Z"(i8*) local_unnamed_addr #10
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #2
+declare i1 @llvm.coro.end(i8*, i1) #3
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #4
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #4
+


        


More information about the llvm-commits mailing list