[llvm-branch-commits] [llvm] [LLVM][Coroutines] Create `.noalloc` variant of switch ABI coroutine ramp functions during CoroSplit (PR #99283)
Yuxuan Chen via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jul 17 00:12:22 PDT 2024
https://github.com/yuxuanchen1997 created https://github.com/llvm/llvm-project/pull/99283
This patch is the middle end implementation for the coroutine HALO improvement project published on discourse: https://discourse.llvm.org/t/language-extension-for-better-more-deterministic-halo-for-c-coroutines/80044/7
Previously CoroElide depends on inlining, and its analysis does not work very well with code generated by the C++ frontend due the existence of many customization points. There has been issue reported to upstream how ineffective the original CoroElide was in real world applications.
For C++ users, this set of patches aim to fix this problem by providing library authors and users deterministic HALO behaviour for some well-behaved coroutine `Task` types. The stack begins with a library side attribute on the `Task` class that guarantees no unstructured concurrency when coroutines are awaited directly with `co_await`ed as a prvalue. This attribute on Task types gives us lifetime guarantees and makes C++ FE capable to telling the ME which coroutine calls are elidable. We convey such information from FE through the attribute `coro_must_elide`.
This patch modifies CoroSplit to create a variant of the coroutine ramp function that 1) does not use heap allocated frame, instead take an additional parameter as the pointer to the frame. Such parameter is attributed with `dereferenceble` and `align` to convey size and align requirements for the frame. 2) always stores cleanup instead of destroy address for `coro.destroy()` actions.
In a later patch, we will have a new pass that runs right after CoroSplit to find usages of the callee coroutine attributed `coro_must_elide` in presplit coroutine callers, allocates the frame on its "stack", transform those usages to call the `noalloc` ramp function variant.
(note I put quotes on the word "stack" here, because for presplit coroutine, any alloca will be spilled into the frame when it's being split)
The C++ Frontend attribute implementation that works with this change can be found at https://github.com/llvm/llvm-project/pull/99282
>From 19bd17bc91ebc3cd03d4a8ea85ec2b1b4cb3f024 Mon Sep 17 00:00:00 2001
From: Yuxuan Chen <ych at meta.com>
Date: Mon, 15 Jul 2024 15:01:39 -0700
Subject: [PATCH] Implement LLVM bits
---
llvm/lib/Transforms/Coroutines/CoroInternal.h | 4 +
llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 123 ++++++++++++++----
llvm/lib/Transforms/Coroutines/Coroutines.cpp | 27 ++++
llvm/test/Transforms/Coroutines/ArgAddr.ll | 2 +-
.../Transforms/Coroutines/coro-alloca-07.ll | 2 +-
.../coro-alloca-loop-carried-address.ll | 2 +-
.../Coroutines/coro-lifetime-end.ll | 6 +-
.../Coroutines/coro-spill-after-phi.ll | 2 +-
.../Transforms/Coroutines/coro-split-00.ll | 7 +
9 files changed, 142 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 5716fd0ea4ab9..d91cccd99a703 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -26,6 +26,10 @@ bool declaresIntrinsics(const Module &M,
const std::initializer_list<StringRef>);
void replaceCoroFree(CoroIdInst *CoroId, bool Elide);
+void suppressCoroAllocs(CoroIdInst *CoroId);
+void suppressCoroAllocs(LLVMContext &Context,
+ ArrayRef<CoroAllocInst *> CoroAllocs);
+
/// Attempts to rewrite the location operand of debug intrinsics in terms of
/// the coroutine frame pointer, folding pointer offsets into the DIExpression
/// of the intrinsic.
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 9e4da5f8ca961..9c0db4f29056e 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CFG.h"
@@ -1179,6 +1180,14 @@ static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) {
Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct);
}
+static TypeSize getFrameSizeForShape(coro::Shape &Shape) {
+ // In the same function all coro.sizes should have the same result type.
+ auto *SizeIntrin = Shape.CoroSizes.back();
+ Module *M = SizeIntrin->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ return DL.getTypeAllocSize(Shape.FrameTy);
+}
+
static void replaceFrameSizeAndAlignment(coro::Shape &Shape) {
if (Shape.ABI == coro::ABI::Async)
updateAsyncFuncPointerContextSize(Shape);
@@ -1194,10 +1203,8 @@ static void replaceFrameSizeAndAlignment(coro::Shape &Shape) {
// In the same function all coro.sizes should have the same result type.
auto *SizeIntrin = Shape.CoroSizes.back();
- Module *M = SizeIntrin->getModule();
- const DataLayout &DL = M->getDataLayout();
- auto Size = DL.getTypeAllocSize(Shape.FrameTy);
- auto *SizeConstant = ConstantInt::get(SizeIntrin->getType(), Size);
+ auto *SizeConstant =
+ ConstantInt::get(SizeIntrin->getType(), getFrameSizeForShape(Shape));
for (CoroSizeInst *CS : Shape.CoroSizes) {
CS->replaceAllUsesWith(SizeConstant);
@@ -1455,6 +1462,64 @@ struct SwitchCoroutineSplitter {
setCoroInfo(F, Shape, Clones);
}
+ static Function *createNoAllocVariant(Function &F, coro::Shape &Shape,
+ SmallVectorImpl<Function *> &Clones) {
+ auto *OrigFnTy = F.getFunctionType();
+ auto OldParams = OrigFnTy->params();
+
+ SmallVector<Type *> NewParams;
+ NewParams.reserve(OldParams.size() + 1);
+ for (Type *T : OldParams) {
+ NewParams.push_back(T);
+ }
+ NewParams.push_back(PointerType::getUnqual(Shape.FrameTy));
+
+ auto *NewFnTy = FunctionType::get(OrigFnTy->getReturnType(), NewParams,
+ OrigFnTy->isVarArg());
+ Function *NoAllocF =
+ Function::Create(NewFnTy, F.getLinkage(), F.getName() + ".noalloc");
+ ValueToValueMapTy VMap;
+ unsigned int Idx = 0;
+ for (const auto &I : F.args()) {
+ VMap[&I] = NoAllocF->getArg(Idx++);
+ }
+ SmallVector<ReturnInst *, 4> Returns;
+ CloneFunctionInto(NoAllocF, &F, VMap,
+ CloneFunctionChangeType::LocalChangesOnly, Returns);
+
+ if (Shape.CoroBegin) {
+ auto *NewCoroBegin =
+ cast_if_present<CoroBeginInst>(VMap[Shape.CoroBegin]);
+ auto *NewCoroId = cast<CoroIdInst>(NewCoroBegin->getId());
+ coro::replaceCoroFree(NewCoroId, /*Elide=*/true);
+ coro::suppressCoroAllocs(NewCoroId);
+ NewCoroBegin->replaceAllUsesWith(NoAllocF->getArg(Idx));
+ NewCoroBegin->eraseFromParent();
+ }
+
+ Module *M = F.getParent();
+ M->getFunctionList().insert(M->end(), NoAllocF);
+
+ removeUnreachableBlocks(*NoAllocF);
+ auto NewAttrs = NoAllocF->getAttributes();
+ // We just appended the frame pointer as the last argument of the new
+ // function.
+ auto FrameIdx = NoAllocF->arg_size() - 1;
+ // When we elide allocation, we read these attributes to determine the
+ // frame size and alignment.
+ addFramePointerAttrs(NewAttrs, NoAllocF->getContext(), FrameIdx,
+ Shape.FrameSize, Shape.FrameAlign,
+ /*NoAlias=*/false);
+
+ NoAllocF->setAttributes(NewAttrs);
+
+ Clones.push_back(NoAllocF);
+ // Reset the original function's coro info, make the new noalloc variant
+ // connected to the original ramp function.
+ setCoroInfo(F, Shape, Clones);
+ return NoAllocF;
+ }
+
private:
// Create a resume clone by cloning the body of the original function, setting
// new entry block and replacing coro.suspend an appropriate value to force
@@ -1913,6 +1978,21 @@ class PrettyStackTraceFunction : public PrettyStackTraceEntry {
};
} // namespace
+/// Remove calls to llvm.coro.end in the original function.
+static void removeCoroEndsFromRampFunction(const coro::Shape &Shape) {
+ if (Shape.ABI != coro::ABI::Switch) {
+ for (auto *End : Shape.CoroEnds) {
+ replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, nullptr);
+ }
+ } else {
+ for (llvm::AnyCoroEndInst *End : Shape.CoroEnds) {
+ auto &Context = End->getContext();
+ End->replaceAllUsesWith(ConstantInt::getFalse(Context));
+ End->eraseFromParent();
+ }
+ }
+}
+
static coro::Shape
splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
TargetTransformInfo &TTI, bool OptimizeFrame,
@@ -1932,10 +2012,10 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
simplifySuspendPoints(Shape);
buildCoroutineFrame(F, Shape, TTI, MaterializableCallback);
replaceFrameSizeAndAlignment(Shape);
-
+ bool isNoSuspendCoroutine = Shape.CoroSuspends.empty();
// If there are no suspend points, no split required, just remove
// the allocation and deallocation blocks, they are not needed.
- if (Shape.CoroSuspends.empty()) {
+ if (isNoSuspendCoroutine) {
handleNoSuspendCoroutine(Shape);
} else {
switch (Shape.ABI) {
@@ -1967,22 +2047,13 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
for (DbgVariableRecord *DVR : DbgVariableRecords)
coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame,
false /*UseEntryValue*/);
- return Shape;
-}
-/// Remove calls to llvm.coro.end in the original function.
-static void removeCoroEndsFromRampFunction(const coro::Shape &Shape) {
- if (Shape.ABI != coro::ABI::Switch) {
- for (auto *End : Shape.CoroEnds) {
- replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, nullptr);
- }
- } else {
- for (llvm::AnyCoroEndInst *End : Shape.CoroEnds) {
- auto &Context = End->getContext();
- End->replaceAllUsesWith(ConstantInt::getFalse(Context));
- End->eraseFromParent();
- }
+ removeCoroEndsFromRampFunction(Shape);
+
+ if (!isNoSuspendCoroutine && Shape.ABI == coro::ABI::Switch) {
+ SwitchCoroutineSplitter::createNoAllocVariant(F, Shape, Clones);
}
+ return Shape;
}
static void updateCallGraphAfterCoroutineSplit(
@@ -2108,18 +2179,18 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
// Split all the coroutines.
for (LazyCallGraph::Node *N : Coroutines) {
Function &F = N->getFunction();
+
LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F.getName()
<< "\n");
F.setSplittedCoroutine();
SmallVector<Function *, 4> Clones;
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- const coro::Shape Shape =
+ coro::Shape Shape =
splitCoroutine(F, Clones, FAM.getResult<TargetIRAnalysis>(F),
OptimizeFrame, MaterializableCallback);
- removeCoroEndsFromRampFunction(Shape);
updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "CoroSplit", &F)
<< "Split '" << ore::NV("function", F.getName())
@@ -2135,9 +2206,9 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
}
}
- for (auto *PrepareFn : PrepareFns) {
- replaceAllPrepares(PrepareFn, CG, C);
- }
+ for (auto *PrepareFn : PrepareFns) {
+ replaceAllPrepares(PrepareFn, CG, C);
+ }
return PreservedAnalyses::none();
}
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 1a92bc1636257..be257339e0ac4 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -145,6 +145,33 @@ void coro::replaceCoroFree(CoroIdInst *CoroId, bool Elide) {
}
}
+void coro::suppressCoroAllocs(CoroIdInst *CoroId) {
+ SmallVector<CoroAllocInst *, 4> CoroAllocs;
+ for (User *U : CoroId->users())
+ if (auto *CA = dyn_cast<CoroAllocInst>(U))
+ CoroAllocs.push_back(CA);
+
+ if (CoroAllocs.empty())
+ return;
+
+ coro::suppressCoroAllocs(CoroId->getContext(), CoroAllocs);
+}
+
+// Replacing llvm.coro.alloc with false will suppress dynamic
+// allocation as it is expected for the frontend to generate the code that
+// looks like:
+// id = coro.id(...)
+// mem = coro.alloc(id) ? malloc(coro.size()) : 0;
+// coro.begin(id, mem)
+void coro::suppressCoroAllocs(LLVMContext &Context,
+ ArrayRef<CoroAllocInst *> CoroAllocs) {
+ auto *False = ConstantInt::getFalse(Context);
+ for (auto *CA : CoroAllocs) {
+ CA->replaceAllUsesWith(False);
+ CA->eraseFromParent();
+ }
+}
+
static void clear(coro::Shape &Shape) {
Shape.CoroBegin = nullptr;
Shape.CoroEnds.clear();
diff --git a/llvm/test/Transforms/Coroutines/ArgAddr.ll b/llvm/test/Transforms/Coroutines/ArgAddr.ll
index 1fbc8e1d49767..6c18cc19a9c0c 100644
--- a/llvm/test/Transforms/Coroutines/ArgAddr.ll
+++ b/llvm/test/Transforms/Coroutines/ArgAddr.ll
@@ -5,7 +5,7 @@
define nonnull ptr @f(i32 %n) presplitcoroutine {
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @f.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 [[N:%.*]], ptr [[N_ADDR]], align 4
; CHECK-NEXT: [[CALL:%.*]] = tail call ptr @malloc(i32 24)
diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-07.ll b/llvm/test/Transforms/Coroutines/coro-alloca-07.ll
index c81bf333f2059..914fd87ccdffc 100644
--- a/llvm/test/Transforms/Coroutines/coro-alloca-07.ll
+++ b/llvm/test/Transforms/Coroutines/coro-alloca-07.ll
@@ -62,7 +62,7 @@ declare void @free(ptr)
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @f.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i32 48)
; CHECK-NEXT: [[HDL:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
; CHECK-NEXT: store ptr @f.resume, ptr [[HDL]], align 8
diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll b/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll
index 412327a49dcf2..b132f79f13db1 100644
--- a/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll
+++ b/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll
@@ -7,7 +7,7 @@
define void @foo() presplitcoroutine {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @foo.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 40)
; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
; CHECK-NEXT: store ptr @foo.resume, ptr [[VFRAME]], align 8
diff --git a/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
index 330c61360e20a..d0b856865c215 100644
--- a/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
+++ b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
@@ -13,7 +13,7 @@ declare void @consume.i8.array(ptr)
define void @HasNoLifetimeEnd() presplitcoroutine {
; CHECK-LABEL: define void @HasNoLifetimeEnd() {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @HasNoLifetimeEnd.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
; CHECK-NEXT: store ptr @HasNoLifetimeEnd.resume, ptr [[VFRAME]], align 8
@@ -50,7 +50,7 @@ exit:
define void @LifetimeEndAfterCoroEnd() presplitcoroutine {
; CHECK-LABEL: define void @LifetimeEndAfterCoroEnd() {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @LifetimeEndAfterCoroEnd.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
; CHECK-NEXT: store ptr @LifetimeEndAfterCoroEnd.resume, ptr [[VFRAME]], align 8
@@ -88,7 +88,7 @@ exit:
define void @BranchWithoutLifetimeEnd() presplitcoroutine {
; CHECK-LABEL: define void @BranchWithoutLifetimeEnd() {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @BranchWithoutLifetimeEnd.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
; CHECK-NEXT: store ptr @BranchWithoutLifetimeEnd.resume, ptr [[VFRAME]], align 8
diff --git a/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll b/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll
index cbe57a8d61132..41b53d89c5dfe 100644
--- a/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll
+++ b/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll
@@ -8,7 +8,7 @@
define ptr @f(i1 %n) presplitcoroutine {
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @f.resumers)
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @{{.*}})
; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i32 32)
; CHECK-NEXT: [[HDL:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
; CHECK-NEXT: store ptr @f.resume, ptr [[HDL]], align 8
diff --git a/llvm/test/Transforms/Coroutines/coro-split-00.ll b/llvm/test/Transforms/Coroutines/coro-split-00.ll
index b35bd720b86f9..d89938388eb8e 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-00.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-00.ll
@@ -63,6 +63,13 @@ suspend:
; CHECK-NOT: call void @free(
; CHECK: ret void
+; CHECK-LABEL: @f.noalloc({{.*}})
+; CHECK-NOT: call ptr @malloc
+; CHECK: call void @print(i32 0)
+; CHECK-NOT: call void @print(i32 1)
+; CHECK-NOT: call void @free(
+; CHECK: ret ptr %{{.*}}
+
declare ptr @llvm.coro.free(token, ptr)
declare i32 @llvm.coro.size.i32()
declare i8 @llvm.coro.suspend(token, i1)
More information about the llvm-branch-commits
mailing list