[Mlir-commits] [mlir] [GPUToLLVM] Support multiple async dependencies in gpu.launch_func lowering (PR #188987)

Fri Mar 27 08:55:56 PDT 2026

https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/188987

>From 1b24e87b20523f76049ca528b276ebff25277392 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Thu, 26 Mar 2026 15:39:22 -0700
Subject: [PATCH 1/2] [GPUToLLVM] Support multiple async dependencies in
 gpu.launch_func lowering

LegalizeLaunchFuncOpPattern previously rejected gpu.launch_func ops
with more than one async dependency. This change removes that limitation
by synchronizing additional dependencies onto the primary stream using
CUDA/HIP events, following the same approach already used in
ConvertWaitAsyncOpToGpuRuntimeCallPattern for gpu.wait async.

For each additional async dependency beyond the first:
- If it is a stream (produced by mgpuStreamCreate), create an event,
  record it on that stream, wait for it on the primary stream, then
  destroy the event.
- If it is already an event, wait for it directly on the primary stream
  and destroy it.

Fixes #156984

Assisted-by: Claude Code
---
 .../GPUCommon/GPUToLLVMConversion.cpp         | 33 ++++++++++++++---
 ...ower-launch-func-to-gpu-runtime-calls.mlir | 35 ++++++++++++++++++-
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index d48a0db4d9de0..8d9a3f69b5edc 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -951,10 +951,6 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
     return failure();
 
-  if (launchOp.getAsyncDependencies().size() > 1)
-    return rewriter.notifyMatchFailure(
-        launchOp, "Cannot convert with more than one async dependency.");
-
   // Fail when the synchronous version of the op has async dependencies. The
   // lowering destroys the stream, and we do not want to check that there is no
   // use of the stream after this op.
@@ -965,8 +961,35 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   Location loc = launchOp.getLoc();
 
   Value stream = Value();
-  if (!adaptor.getAsyncDependencies().empty())
+  if (!adaptor.getAsyncDependencies().empty()) {
     stream = adaptor.getAsyncDependencies().front();
+    // Synchronize additional async dependencies onto the primary stream using
+    // events, following the same approach as gpu.wait async lowering.
+    if (adaptor.getAsyncDependencies().size() > 1) {
+      auto insertionPoint = rewriter.saveInsertionPoint();
+      SmallVector<Value, 4> events;
+      for (auto [origDep, convertedDep] :
+           llvm::zip(launchOp.getAsyncDependencies().drop_front(),
+                     adaptor.getAsyncDependencies().drop_front())) {
+        if (isDefinedByCallTo(convertedDep,
+                              streamCreateCallBuilder.functionName)) {
+          auto *defOp = origDep.getDefiningOp();
+          rewriter.setInsertionPointAfter(defOp);
+          auto event =
+              eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
+          eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
+          events.push_back(event);
+        } else {
+          events.push_back(convertedDep);
+        }
+      }
+      rewriter.restoreInsertionPoint(insertionPoint);
+      for (auto event : events)
+        streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
+      for (auto event : events)
+        eventDestroyCallBuilder.create(loc, rewriter, {event});
+    }
+  }
   // If the async keyword is present and there are no dependencies, then a
   // stream must be created to pass to subsequent operations.
   else if (launchOp.getAsyncToken())
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 6c5c1e09c0eb5..79baa8c894120 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -80,7 +80,7 @@ module attributes {gpu.container_module} {
   // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
   // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
   // CHECK: [[C2:%.*]] = llvm.mlir.constant(2 : index) : i64
-    %c8 = arith.constant 8 : index    
+    %c8 = arith.constant 8 : index
     %c32 = arith.constant 32 : i32
     %c256 = arith.constant 256 : i32
     %c2 = arith.constant 2 : index
@@ -99,3 +99,36 @@ module attributes {gpu.container_module} {
     return
   }
 }
+
+// -----
+
+// Test that gpu.launch_func async with multiple async dependencies correctly
+// synchronizes all deps onto the primary stream via events.
+module attributes {gpu.container_module} {
+  gpu.module @kernel_module [#nvvm.target] {
+    llvm.func @kernel() attributes {gpu.kernel} {
+      llvm.return
+    }
+  }
+
+  // CHECK-LABEL: @multi_dep_launch
+  func.func @multi_dep_launch() {
+    %c1 = arith.constant 1 : index
+    // CHECK: %[[stream0:.*]] = llvm.call @mgpuStreamCreate()
+    %t0 = gpu.wait async
+    // CHECK: %[[stream1:.*]] = llvm.call @mgpuStreamCreate()
+    %t1 = gpu.wait async
+    // The event for the second dependency is created and recorded right after
+    // the op that defines its stream; the primary stream then waits on it.
+    // CHECK: %[[e:.*]] = llvm.call @mgpuEventCreate()
+    // CHECK: llvm.call @mgpuEventRecord(%[[e]], %[[stream1]])
+    // CHECK: llvm.call @mgpuStreamWaitEvent(%[[stream0]], %[[e]])
+    // CHECK: llvm.call @mgpuEventDestroy(%[[e]])
+    // CHECK: gpu.launch_func <%[[stream0]] : !llvm.ptr> @kernel_module::@kernel
+    %t2 = gpu.launch_func async [%t0, %t1] @kernel_module::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+    gpu.wait [%t2]
+    return
+  }
+}

>From 86b7f63dec5568f3b059f4591eda50c923bf462e Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Fri, 27 Mar 2026 16:55:48 +0100
Subject: [PATCH 2/2] Update
 mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

Co-authored-by: Fabian Mora <fmora.dev at gmail.com>
---
 .../Conversion/GPUCommon/GPUToLLVMConversion.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 8d9a3f69b5edc..d775e81c9463b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -971,17 +971,17 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
       for (auto [origDep, convertedDep] :
            llvm::zip(launchOp.getAsyncDependencies().drop_front(),
                      adaptor.getAsyncDependencies().drop_front())) {
-        if (isDefinedByCallTo(convertedDep,
+        if (!isDefinedByCallTo(convertedDep,
                               streamCreateCallBuilder.functionName)) {
-          auto *defOp = origDep.getDefiningOp();
-          rewriter.setInsertionPointAfter(defOp);
-          auto event =
-              eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
-          eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
-          events.push_back(event);
-        } else {
           events.push_back(convertedDep);
+          continue;
         }
+        Operation *defOp = origDep.getDefiningOp();
+        rewriter.setInsertionPointAfter(defOp);
+        Value event =
+            eventCreateCallBuilder.create(loc, rewriter, {}).getResult();
+        eventRecordCallBuilder.create(loc, rewriter, {event, convertedDep});
+        events.push_back(event);
       }
       rewriter.restoreInsertionPoint(insertionPoint);
       for (auto event : events)