[Mlir-commits] [mlir] ea8b160 - [GPUToLLVM] Support multiple async dependencies in gpu.launch_func lowering (#188987)

Fri Mar 27 09:09:24 PDT 2026

Author: Mehdi Amini
Date: 2026-03-27T16:09:19Z
New Revision: ea8b1608af6d8bff6a0abb91d209428a99d0e3c3

URL: https://github.com/llvm/llvm-project/commit/ea8b1608af6d8bff6a0abb91d209428a99d0e3c3
DIFF: https://github.com/llvm/llvm-project/commit/ea8b1608af6d8bff6a0abb91d209428a99d0e3c3.diff

LOG: [GPUToLLVM] Support multiple async dependencies in gpu.launch_func lowering (#188987)

LegalizeLaunchFuncOpPattern previously rejected gpu.launch_func ops with
more than one async dependency. This change removes that limitation by
synchronizing each additional dependency onto the primary stream (the
first async dependency) using CUDA/HIP events, following the same
approach already used in ConvertWaitAsyncOpToGpuRuntimeCallPattern for
gpu.wait async.

For each additional async dependency beyond the first:
- If it is a stream (produced by mgpuStreamCreate), create an event,
  record it on that stream, wait for it on the primary stream, then
  destroy the event.
- If it is already an event, wait for it directly on the primary stream
  and destroy it.

Fixes #156984

Assisted-by: Claude Code

Added: 
    

Modified: 
    mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
    mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index d48a0db4d9de0..3e99c537d0e02 100644

--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -951,10 +951,6 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
     return failure();
 
-  if (launchOp.getAsyncDependencies().size() > 1)
-    return rewriter.notifyMatchFailure(
-        launchOp, "Cannot convert with more than one async dependency.");
-
   // Fail when the synchronous version of the op has async dependencies. The
   // lowering destroys the stream, and we do not want to check that there is no
   // use of the stream after this op.
@@ -965,8 +961,35 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   Location loc = launchOp.getLoc();
 
   Value stream = Value();
-  if (!adaptor.getAsyncDependencies().empty())
+  if (!adaptor.getAsyncDependencies().empty()) {
     stream = adaptor.getAsyncDependencies().front();
+    // Synchronize additional async dependencies onto the primary stream using
+    // events, following the same approach as gpu.wait async lowering.
+    if (adaptor.getAsyncDependencies().size() > 1) {
+      auto insertionPoint = rewriter.saveInsertionPoint();
+      SmallVector<Value, 4> events;
+      for (auto [origDep, convertedDep] :
+           llvm::zip(launchOp.getAsyncDependencies().drop_front(),
+                     adaptor.getAsyncDependencies().drop_front())) {
+        if (!isDefinedByCallTo(convertedDep,
+                               streamCreateCallBuilder.functionName)) {
+          events.push_back(convertedDep);
+          continue;
+        }
+        Operation *defOp = origDep.getDefiningOp();
+        rewriter.setInsertionPointAfter(defOp);
+        Value event =
+            eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
+        eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
+        events.push_back(event);
+      }
+      rewriter.restoreInsertionPoint(insertionPoint);
+      for (Value event : events)
+        streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
+      for (Value event : events)
+        eventDestroyCallBuilder.create(loc, rewriter, {event});
+    }
+  }
   // If the async keyword is present and there are no dependencies, then a
   // stream must be created to pass to subsequent operations.
   else if (launchOp.getAsyncToken())

diff  --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 6c5c1e09c0eb5..79baa8c894120 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -80,7 +80,7 @@ module attributes {gpu.container_module} {
   // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
   // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
   // CHECK: [[C2:%.*]] = llvm.mlir.constant(2 : index) : i64
-    %c8 = arith.constant 8 : index    
+    %c8 = arith.constant 8 : index
     %c32 = arith.constant 32 : i32
     %c256 = arith.constant 256 : i32
     %c2 = arith.constant 2 : index
@@ -99,3 +99,36 @@ module attributes {gpu.container_module} {
     return
   }
 }
+
+// -----
+
+// Test that gpu.launch_func async with multiple async dependencies correctly
+// synchronizes all deps onto the primary stream via events.
+module attributes {gpu.container_module} {
+  gpu.module @kernel_module [#nvvm.target] {
+    llvm.func @kernel() attributes {gpu.kernel} {
+      llvm.return
+    }
+  }
+
+  // CHECK-LABEL: @multi_dep_launch
+  func.func @multi_dep_launch() {
+    %c1 = arith.constant 1 : index
+    // CHECK: %[[stream0:.*]] = llvm.call @mgpuStreamCreate()
+    %t0 = gpu.wait async
+    // CHECK: %[[stream1:.*]] = llvm.call @mgpuStreamCreate()
+    %t1 = gpu.wait async
+    // The event for the second dependency is created and recorded after
+    // its defining stream, then the primary stream waits on it.
+    // CHECK: %[[e:.*]] = llvm.call @mgpuEventCreate()
+    // CHECK: llvm.call @mgpuEventRecord(%[[e]], %[[stream1]])
+    // CHECK: llvm.call @mgpuStreamWaitEvent(%[[stream0]], %[[e]])
+    // CHECK: llvm.call @mgpuEventDestroy(%[[e]])
+    // CHECK: gpu.launch_func <%[[stream0]] : !llvm.ptr> @kernel_module::@kernel
+    %t2 = gpu.launch_func async [%t0, %t1] @kernel_module::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+    gpu.wait [%t2]
+    return
+  }
+}