[Mlir-commits] [mlir] f03826f - Pass GPU events instead of streams across async regions.
Christian Sigg
llvmlistbot at llvm.org
Thu Feb 25 04:18:28 PST 2021
Author: Christian Sigg
Date: 2021-02-25T13:18:18+01:00
New Revision: f03826f896beb7edb68c7576be4721ff8711dacb
URL: https://github.com/llvm/llvm-project/commit/f03826f896beb7edb68c7576be4721ff8711dacb
DIFF: https://github.com/llvm/llvm-project/commit/f03826f896beb7edb68c7576be4721ff8711dacb.diff
LOG: Pass GPU events instead of streams across async regions.
Lower the !gpu.async.token values returned from async.execute regions to events instead of streams.
Make each !gpu.async.token returned from async.execute single-use.
This allows creating one event per use and destroying each event without leaking or ref-counting.
Technically this is only needed for the stream/event-based lowering. I kept the code separate
from the rest of the gpu-async-region pass so that we can make it optional or move it
to a separate pass as needed.
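As an illustrative sketch (not part of the commit; the kernel, size, and value names are borrowed from the tests below), an async.execute region whose token is consumed twice now yields the token once per use, and the GPU-to-LLVM conversion turns each yielded token into a freshly recorded event while the region's stream is destroyed:

  %a, %f:2 = async.execute
      -> (!async.value<!gpu.async.token>, !async.value<!gpu.async.token>) {
    %t = gpu.launch_func async @kernels::@kernel
        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
    // One yielded token per consumer of %a; each becomes its own event.
    async.yield %t, %t : !gpu.async.token, !gpu.async.token
  }

Inside the region, %t is backed by a stream; at the async.yield, the new pattern creates and records one event per yielded token and then destroys the stream, so only events cross the region boundary.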
Reviewed By: herhut
Differential Revision: https://reviews.llvm.org/D96965
Added:
Modified:
mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
mlir/test/Dialect/GPU/async-region.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
index 5b5761ac63de..3570dc741306 100644
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
@@ -18,6 +18,7 @@
#include "../PassDetail.h"
#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
@@ -51,8 +52,7 @@ class GpuToLLVMConversionPass
void runOnOperation() override;
};
-class FunctionCallBuilder {
-public:
+struct FunctionCallBuilder {
FunctionCallBuilder(StringRef functionName, Type returnType,
ArrayRef<Type> argumentTypes)
: functionName(functionName),
@@ -60,7 +60,6 @@ class FunctionCallBuilder {
LLVM::CallOp create(Location loc, OpBuilder &builder,
ArrayRef<Value> arguments) const;
-private:
StringRef functionName;
LLVM::LLVMFunctionType functionType;
};
@@ -202,6 +201,18 @@ class ConvertDeallocOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};
+class ConvertAsyncYieldToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
+public:
+ ConvertAsyncYieldToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}
+
+private:
+ LogicalResult
+ matchAndRewrite(async::YieldOp yieldOp, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
@@ -429,11 +440,53 @@ LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
-// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
-// streams (i.e. void*). The converted op synchronizes the host with every
-// stream and then destroys it. That is, it assumes that the stream is not used
-// afterwards. In case this isn't correct, we will get a runtime error.
-// Eventually, we will have a pass that guarantees this property.
+static bool isGpuAsyncTokenType(Value value) {
+ return value.getType().isa<gpu::AsyncTokenType>();
+}
+
+// Converts !gpu.async.token operands of `async.yield` to runtime calls. The
+// !gpu.async.token values are lowered to streams within the async.execute
+// region, but are passed as events between regions. For each !gpu.async.token
+// operand, we create an event and record it on the stream.
+LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
+ async::YieldOp yieldOp, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const {
+ if (llvm::none_of(yieldOp.operands(), isGpuAsyncTokenType))
+ return rewriter.notifyMatchFailure(yieldOp, "no gpu async token operand");
+
+ Location loc = yieldOp.getLoc();
+ SmallVector<Value, 4> newOperands(operands.begin(), operands.end());
+ llvm::SmallDenseSet<Value> streams;
+ for (auto &operand : yieldOp->getOpOperands()) {
+ if (!isGpuAsyncTokenType(operand.get()))
+ continue;
+ auto idx = operand.getOperandNumber();
+ auto stream = operands[idx];
+ auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
+ eventRecordCallBuilder.create(loc, rewriter, {event, stream});
+ newOperands[idx] = event;
+ streams.insert(stream);
+ }
+ for (auto stream : streams)
+ streamDestroyCallBuilder.create(loc, rewriter, {stream});
+
+ rewriter.updateRootInPlace(yieldOp,
+ [&] { yieldOp->setOperands(newOperands); });
+ return success();
+}
+
+// Returns whether `value` is the result of an LLVM::CallOp to `functionName`.
+static bool isDefinedByCallTo(Value value, StringRef functionName) {
+ assert(value.getType().isa<LLVM::LLVMPointerType>());
+ if (auto defOp = value.getDefiningOp<LLVM::CallOp>())
+ return defOp.callee()->equals(functionName);
+ return false;
+}
+
+// Converts `gpu.wait` to runtime calls. The converted op synchronizes the host
+// with the stream/event operands. The operands are destroyed. That is, it
+// assumes that they are not used afterwards or elsewhere. Otherwise we will
+// get a runtime error. Eventually, we should guarantee this property.
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::WaitOp waitOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
@@ -442,21 +495,28 @@ LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
Location loc = waitOp.getLoc();
- for (auto asyncDependency : operands)
- streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
- for (auto asyncDependency : operands)
- streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});
+ for (auto operand : operands) {
+ if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
+ // The converted operand's definition created a stream.
+ streamSynchronizeCallBuilder.create(loc, rewriter, {operand});
+ streamDestroyCallBuilder.create(loc, rewriter, {operand});
+ } else {
+ // Otherwise the converted operand is an event. This assumes that we use
+ // events in control flow code as well.
+ eventSynchronizeCallBuilder.create(loc, rewriter, {operand});
+ eventDestroyCallBuilder.create(loc, rewriter, {operand});
+ }
+ }
rewriter.eraseOp(waitOp);
return success();
}
-// Converts `gpu.wait async` to runtime calls. The result is a new stream that
-// is synchronized with all operands, which are CUDA or ROCm streams (i.e.
-// void*). We create and record an event after the definition of the stream
-// and make the new stream wait on that event before destroying it again. This
-// assumes that there is no other use between the definition and this op, and
-// the plan is to have a pass that guarantees this property.
+// Converts `gpu.wait async` to runtime calls. The converted op creates a new
+// stream that is synchronized with the stream/event operands. The operands are
+// destroyed. That is, it assumes that they are not used afterwards or
+// elsewhere. Otherwise we will get a runtime error. Eventually, we should
+// guarantee this property.
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::WaitOp waitOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
@@ -468,18 +528,21 @@ LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
auto insertionPoint = rewriter.saveInsertionPoint();
SmallVector<Value, 1> events;
for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
- auto token = std::get<0>(pair);
- if (auto *defOp = token.getDefiningOp()) {
+ auto operand = std::get<1>(pair);
+ if (isDefinedByCallTo(operand, streamCreateCallBuilder.functionName)) {
+ // The converted operand's definition created a stream. Insert an event
+ // into the stream just after the last use of the original token operand.
+ auto *defOp = std::get<0>(pair).getDefiningOp();
rewriter.setInsertionPointAfter(defOp);
+ auto event =
+ eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
+ eventRecordCallBuilder.create(loc, rewriter, {event, operand});
+ events.push_back(event);
} else {
- // If we can't find the defining op, we record the event at block start,
- // which is late and therefore misses parallelism, but still valid.
- rewriter.setInsertionPointToStart(waitOp->getBlock());
+ // Otherwise the converted operand is an event. This assumes that we use
+ // events in control flow code as well.
+ events.push_back(operand);
}
- auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
- auto stream = std::get<1>(pair);
- eventRecordCallBuilder.create(loc, rewriter, {event, stream});
- events.push_back(event);
}
rewriter.restoreInsertionPoint(insertionPoint);
auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
@@ -729,7 +792,8 @@ void mlir::populateGpuToLLVMConversionPatterns(
ConvertHostRegisterOpToGpuRuntimeCallPattern,
ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,
- ConvertWaitOpToGpuRuntimeCallPattern>(converter);
+ ConvertWaitOpToGpuRuntimeCallPattern,
+ ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);
patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation);
patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
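For reference, a rough sketch of what the reworked gpu.wait lowering above emits, assuming the mgpu* runtime wrapper functions referenced by the call builders (the exact call types shown here are an assumption of this sketch, not part of the commit):

  // gpu.wait [%s, %e], where %s was produced by a call to @mgpuStreamCreate
  // and %e is anything else (assumed to be an event):
  llvm.call @mgpuStreamSynchronize(%s) : (!llvm.ptr<i8>) -> ()
  llvm.call @mgpuStreamDestroy(%s) : (!llvm.ptr<i8>) -> ()
  llvm.call @mgpuEventSynchronize(%e) : (!llvm.ptr<i8>) -> ()
  llvm.call @mgpuEventDestroy(%e) : (!llvm.ptr<i8>) -> ()

The isDefinedByCallTo check is what distinguishes the two cases: operands produced by a call to the stream-create function are treated as streams, everything else as events.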
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
index fe4395f26859..e5278c7ad39c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -30,6 +30,7 @@ namespace {
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
struct ThreadTokenCallback;
struct DeferWaitCallback;
+ struct SingleTokenUseCallback;
void runOnFunction() override;
};
} // namespace
@@ -120,6 +121,44 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
Value currentToken = {};
};
+/// Erases `executeOp` and returns a clone with additional `results`.
+async::ExecuteOp addExecuteResults(async::ExecuteOp executeOp,
+ ValueRange results) {
+ // Add values to async.yield op.
+ Operation *yieldOp = executeOp.getBody()->getTerminator();
+ yieldOp->insertOperands(yieldOp->getNumOperands(), results);
+
+ // Construct new result type list with additional types.
+ SmallVector<Type, 2> resultTypes;
+ resultTypes.reserve(executeOp.getNumResults() + results.size());
+ transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
+ [](Type type) {
+ // Extract value type from !async.value.
+ if (auto valueType = type.dyn_cast<async::ValueType>())
+ return valueType.getValueType();
+ assert(type.isa<async::TokenType>() && "expected token type");
+ return type;
+ });
+ transform(results, std::back_inserter(resultTypes),
+ [](Value value) { return value.getType(); });
+
+ // Clone executeOp with the extra results.
+ OpBuilder builder(executeOp);
+ auto newOp = builder.create<async::ExecuteOp>(
+ executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
+ executeOp.dependencies(), executeOp.operands());
+ BlockAndValueMapping mapper;
+ newOp.getRegion().getBlocks().clear();
+ executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
+
+ // Replace executeOp with cloned one.
+ executeOp.getOperation()->replaceAllUsesWith(
+ newOp.getResults().drop_back(results.size()));
+ executeOp.erase();
+
+ return newOp;
+}
+
// Callback for `async.execute` ops which tries to push the contained
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
struct GpuAsyncRegionPass::DeferWaitCallback {
@@ -146,69 +185,30 @@ struct GpuAsyncRegionPass::DeferWaitCallback {
for (size_t i = 0; i < worklist.size(); ++i) {
auto waitOp = worklist[i];
auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
- auto numDependencies = waitOp.asyncDependencies().size();
- // Erase `gpu.wait` and return async dependencies from region instead.
- auto &yieldOp = executeOp.getBody()->getOperations().back();
- yieldOp.insertOperands(yieldOp.getNumOperands(),
- waitOp.asyncDependencies());
+ // Erase `gpu.wait` and return async dependencies from execute op instead.
+ SmallVector<Value, 4> dependencies = waitOp.asyncDependencies();
waitOp.erase();
- auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);
+ executeOp = addExecuteResults(executeOp, dependencies);
// Add the async dependency to each user of the `async.execute` token.
+ auto asyncTokens = executeOp.getResults().take_back(dependencies.size());
for (Operation *user : executeOp.token().getUsers())
addAsyncDependencyAfter(asyncTokens, user);
}
}
private:
- // Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
- static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
- unsigned count) {
- auto numResults = executeOp.getNumResults() + count;
-
- // Construct new result type list with `count` additional types.
- SmallVector<Type, 2> resultTypes;
- resultTypes.reserve(numResults);
- transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
- [](Type type) {
- // Extract value type from !async.value.
- if (auto valueType = type.dyn_cast<async::ValueType>())
- return valueType.getValueType();
- assert(type.isa<async::TokenType>() && "expected token type");
- return type;
- });
- OpBuilder builder(executeOp);
- auto tokenType = builder.getType<gpu::AsyncTokenType>();
- resultTypes.resize(numResults, tokenType);
-
- // Clone executeOp with the extra `!gpu.async.token` results.
- auto newOp = builder.create<async::ExecuteOp>(
- executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
- executeOp.dependencies(), executeOp.operands());
- BlockAndValueMapping mapper;
- newOp.getRegion().getBlocks().clear();
- executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
-
- // Replace executeOp with cloned one.
- executeOp.getOperation()->replaceAllUsesWith(
- newOp.getResults().drop_back(count));
- executeOp.erase();
- executeOp = newOp;
-
- // Return the new result values.
- return executeOp.getResults().take_back(count);
- }
-
// Returns whether all token users are either 'async.execute' or 'async.await'
// ops. This is used as a requirement for pushing 'gpu.wait' ops from an
// 'async.execute' body to its users. Specifically, we do not allow
// terminator users, because it could mean that the `async.execute` is inside
// control flow code.
static bool areAllUsersExecuteOrAwait(Value token) {
- return llvm::all_of(token.getUsers(), [](Operation *user) {
- return isa<async::ExecuteOp, async::AwaitOp>(user);
- });
+ return !token.use_empty() &&
+ llvm::all_of(token.getUsers(), [](Operation *user) {
+ return isa<async::ExecuteOp, async::AwaitOp>(user);
+ });
}
// Add the `asyncToken` as dependency as needed after `op`.
@@ -268,6 +268,46 @@ struct GpuAsyncRegionPass::DeferWaitCallback {
SmallVector<gpu::WaitOp, 8> worklist;
};
+// Callback for `async.execute` ops which repeats !gpu.async.token results
+// so that each of them is only used once.
+struct GpuAsyncRegionPass::SingleTokenUseCallback {
+ void operator()(async::ExecuteOp executeOp) {
+ // Extract !gpu.async.token results which have multiple uses.
+ auto multiUseResults =
+ llvm::make_filter_range(executeOp.results(), [](OpResult result) {
+ if (result.use_empty() || result.hasOneUse())
+ return false;
+ auto valueType = result.getType().dyn_cast<async::ValueType>();
+ return valueType &&
+ valueType.getValueType().isa<gpu::AsyncTokenType>();
+ });
+ if (multiUseResults.empty())
+ return;
+
+ // Indices within the async.execute op's value results (i.e. excluding the leading !async.token).
+ SmallVector<int, 4> indices;
+ transform(multiUseResults, std::back_inserter(indices),
+ [](OpResult result) {
+ return result.getResultNumber() - 1; // Index without token.
+ });
+
+ for (auto index : indices) {
+ assert(!executeOp.results()[index].getUses().empty());
+ // Repeat async.yield token result, one for each use after the first one.
+ auto uses = llvm::drop_begin(executeOp.results()[index].getUses());
+ auto count = std::distance(uses.begin(), uses.end());
+ auto yieldOp = cast<async::YieldOp>(executeOp.getBody()->getTerminator());
+ SmallVector<Value, 4> operands(count, yieldOp.getOperand(index));
+ executeOp = addExecuteResults(executeOp, operands);
+ // Update 'uses' to refer to the new executeOp.
+ uses = llvm::drop_begin(executeOp.results()[index].getUses());
+ auto results = executeOp.results().take_back(count);
+ for (auto pair : llvm::zip(uses, results))
+ std::get<0>(pair).set(std::get<1>(pair));
+ }
+ }
+};
+
// Replaces synchronous GPU ops in the op's region with asynchronous ones and
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
// execution semantics and that no GPU ops are asynchronous yet.
@@ -280,6 +320,8 @@ void GpuAsyncRegionPass::runOnFunction() {
// Collect gpu.wait ops that we can move out of async.execute regions.
getFunction().getRegion().walk(DeferWaitCallback());
+ // Make each !gpu.async.token returned from an async.execute op have a single use.
+ getFunction().getRegion().walk(SingleTokenUseCallback());
}
std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
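A minimal sketch (value names hypothetical) of the result layout that SingleTokenUseCallback relies on: async.execute always produces the !async.token as its first result, followed by the !async.value results, while async.yield only carries operands for the values. That is why the callback subtracts one from getResultNumber() when mapping a result back to its yield operand:

  %token, %values:2 = async.execute
      -> (!async.value<f32>, !async.value<!gpu.async.token>) {
    ...
    // Yield operand 0 pairs with %values#0, operand 1 with %values#1.
    async.yield %f, %t : f32, !gpu.async.token
  }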
diff --git a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
index 3234e885185e..2ee85ecd78ae 100644
--- a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir
@@ -12,8 +12,8 @@ module attributes {gpu.container_module} {
// CHECK: llvm.call @mgpuEventDestroy(%[[e0]])
%t1 = gpu.wait async [%t0]
// CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
- // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
// CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
+ // CHECK: llvm.call @mgpuStreamSynchronize(%[[t1]])
// CHECK: llvm.call @mgpuStreamDestroy(%[[t1]])
gpu.wait [%t0, %t1]
return
diff --git a/mlir/test/Dialect/GPU/async-region.mlir b/mlir/test/Dialect/GPU/async-region.mlir
index 84a0356975b0..1a2206c3aa5a 100644
--- a/mlir/test/Dialect/GPU/async-region.mlir
+++ b/mlir/test/Dialect/GPU/async-region.mlir
@@ -125,4 +125,48 @@ module attributes {gpu.container_module} {
// CHECK: return %[[x]] : index
return %x : index
}
+
+ // CHECK-LABEL:func @async_execute_no_use(%{{.*}}: index)
+ func @async_execute_no_use(%sz : index) {
+ // CHECK: async.execute {
+ %a0 = async.execute {
+ // CHECK: %[[t:.*]] = gpu.launch_func async
+ gpu.launch_func @kernels::@kernel
+ blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+ // CHECK: gpu.wait [%[[t]]]
+ async.yield
+ }
+ return
+ }
+
+ // CHECK-LABEL:func @async_execute_fork(%{{.*}}: index)
+ func @async_execute_fork(%sz : index) {
+ // CHECK: %[[a0:.*]], %[[f0:.*]]:2 = async.execute
+ // CHECK-SAME: -> (!async.value<!gpu.async.token>, !async.value<!gpu.async.token>)
+ %a0 = async.execute {
+ // CHECK: %[[t:.*]] = gpu.launch_func async
+ gpu.launch_func @kernels::@kernel
+ blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+ // CHECK-NOT: gpu.wait
+ // CHECK: async.yield %[[t]], %[[t]] : !gpu.async.token, !gpu.async.token
+ async.yield
+ }
+ // CHECK: async.execute [%[[a0]]] (%[[f0]]#0 as {{.*}}: !async.value<!gpu.async.token>)
+ %a1 = async.execute [%a0] {
+ // CHECK: %[[t:.*]] = gpu.launch_func async
+ gpu.launch_func @kernels::@kernel
+ blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+ // CHECK: gpu.wait [%[[t]]]
+ async.yield
+ }
+ // CHECK: async.execute [%[[a0]]] (%[[f0]]#1 as {{.*}}: !async.value<!gpu.async.token>)
+ %a2 = async.execute [%a0] {
+ // CHECK: %[[t:.*]] = gpu.launch_func async
+ gpu.launch_func @kernels::@kernel
+ blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+ // CHECK: gpu.wait [%[[t]]]
+ async.yield
+ }
+ return
+ }
}