[Mlir-commits] [mlir] [GPUToLLVM] Support multiple async dependencies in gpu.launch_func lowering (PR #188987)
Mehdi Amini
llvmlistbot at llvm.org
Fri Mar 27 08:55:56 PDT 2026
https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/188987
>From 1b24e87b20523f76049ca528b276ebff25277392 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 26 Mar 2026 15:39:22 -0700
Subject: [PATCH 1/2] [GPUToLLVM] Support multiple async dependencies in
gpu.launch_func lowering
LegalizeLaunchFuncOpPattern previously rejected gpu.launch_func ops
with more than one async dependency. This change removes that limitation
by synchronizing additional dependencies onto the primary stream using
CUDA/HIP events, following the same approach already used in
ConvertWaitAsyncOpToGpuRuntimeCallPattern for gpu.wait async.
For each async dependency after the first:
- If it is a stream (produced by mgpuStreamCreate), create an event and
record it on that stream right after the op that defines the dependency,
make the primary stream wait on the event, then destroy the event.
- If it is already an event, make the primary stream wait on it directly,
then destroy it.
Fixes #156984
Assisted-by: Claude Code
---
.../GPUCommon/GPUToLLVMConversion.cpp | 33 ++++++++++++++---
...ower-launch-func-to-gpu-runtime-calls.mlir | 35 ++++++++++++++++++-
2 files changed, 62 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index d48a0db4d9de0..8d9a3f69b5edc 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -951,10 +951,6 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
return failure();
- if (launchOp.getAsyncDependencies().size() > 1)
- return rewriter.notifyMatchFailure(
- launchOp, "Cannot convert with more than one async dependency.");
-
// Fail when the synchronous version of the op has async dependencies. The
// lowering destroys the stream, and we do not want to check that there is no
// use of the stream after this op.
@@ -965,8 +961,35 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
Location loc = launchOp.getLoc();
Value stream = Value();
- if (!adaptor.getAsyncDependencies().empty())
+ if (!adaptor.getAsyncDependencies().empty()) {
stream = adaptor.getAsyncDependencies().front();
+ // Synchronize additional async dependencies onto the primary stream using
+ // events, following the same approach as gpu.wait async lowering.
+ if (adaptor.getAsyncDependencies().size() > 1) {
+ auto insertionPoint = rewriter.saveInsertionPoint();
+ SmallVector<Value, 4> events;
+ for (auto [origDep, convertedDep] :
+ llvm::zip(launchOp.getAsyncDependencies().drop_front(),
+ adaptor.getAsyncDependencies().drop_front())) {
+ if (isDefinedByCallTo(convertedDep,
+ streamCreateCallBuilder.functionName)) {
+ auto *defOp = origDep.getDefiningOp();
+ rewriter.setInsertionPointAfter(defOp);
+ auto event =
+ eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
+ eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
+ events.push_back(event);
+ } else {
+ events.push_back(convertedDep);
+ }
+ }
+ rewriter.restoreInsertionPoint(insertionPoint);
+ for (auto event : events)
+ streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
+ for (auto event : events)
+ eventDestroyCallBuilder.create(loc, rewriter, {event});
+ }
+ }
// If the async keyword is present and there are no dependencies, then a
// stream must be created to pass to subsequent operations.
else if (launchOp.getAsyncToken())
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 6c5c1e09c0eb5..79baa8c894120 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -80,7 +80,7 @@ module attributes {gpu.container_module} {
// CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
// CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
// CHECK: [[C2:%.*]] = llvm.mlir.constant(2 : index) : i64
- %c8 = arith.constant 8 : index
+ %c8 = arith.constant 8 : index
%c32 = arith.constant 32 : i32
%c256 = arith.constant 256 : i32
%c2 = arith.constant 2 : index
@@ -99,3 +99,36 @@ module attributes {gpu.container_module} {
return
}
}
+
+// -----
+
+// Test that gpu.launch_func async with multiple async dependencies correctly
+// synchronizes all deps onto the primary stream via events.
+module attributes {gpu.container_module} {
+ gpu.module @kernel_module [#nvvm.target] {
+ llvm.func @kernel() attributes {gpu.kernel} {
+ llvm.return
+ }
+ }
+
+ // CHECK-LABEL: @multi_dep_launch
+ func.func @multi_dep_launch() {
+ %c1 = arith.constant 1 : index
+ // CHECK: %[[stream0:.*]] = llvm.call @mgpuStreamCreate()
+ %t0 = gpu.wait async
+ // CHECK: %[[stream1:.*]] = llvm.call @mgpuStreamCreate()
+ %t1 = gpu.wait async
+ // The event for the second dependency is created and recorded after
+ // its defining stream, then the primary stream waits on it.
+ // CHECK: %[[e:.*]] = llvm.call @mgpuEventCreate()
+ // CHECK: llvm.call @mgpuEventRecord(%[[e]], %[[stream1]])
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%[[stream0]], %[[e]])
+ // CHECK: llvm.call @mgpuEventDestroy(%[[e]])
+ // CHECK: gpu.launch_func <%[[stream0]] : !llvm.ptr> @kernel_module::@kernel
+ %t2 = gpu.launch_func async [%t0, %t1] @kernel_module::@kernel
+ blocks in (%c1, %c1, %c1)
+ threads in (%c1, %c1, %c1)
+ gpu.wait [%t2]
+ return
+ }
+}
>From 86b7f63dec5568f3b059f4591eda50c923bf462e Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Fri, 27 Mar 2026 16:55:48 +0100
Subject: [PATCH 2/2] Update
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
Co-authored-by: Fabian Mora <fmora.dev at gmail.com>
---
.../Conversion/GPUCommon/GPUToLLVMConversion.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 8d9a3f69b5edc..d775e81c9463b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -971,17 +971,17 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
for (auto [origDep, convertedDep] :
llvm::zip(launchOp.getAsyncDependencies().drop_front(),
adaptor.getAsyncDependencies().drop_front())) {
- if (isDefinedByCallTo(convertedDep,
+ if (!isDefinedByCallTo(convertedDep,
streamCreateCallBuilder.functionName)) {
- auto *defOp = origDep.getDefiningOp();
- rewriter.setInsertionPointAfter(defOp);
- auto event =
- eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
- eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
- events.push_back(event);
- } else {
events.push_back(convertedDep);
+ continue;
}
+ Operation *defOp = origDep.getDefiningOp();
+ rewriter.setInsertionPointAfter(defOp);
+ Value event =
+ eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
+ eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
+ events.push_back(event);
}
rewriter.restoreInsertionPoint(insertionPoint);
for (auto event : events)
More information about the Mlir-commits
mailing list