[Mlir-commits] [mlir] [mlir][gpu] Preserve explicit async objects in gpu.launch_func lowering (PR #195436)

Sat May 2 05:34:42 PDT 2026

https://github.com/SabYic updated https://github.com/llvm/llvm-project/pull/195436

From d789899989066234f146623824a0efa7593c76ba Mon Sep 17 00:00:00 2001
From: SabYic <sunwenjia04 at 163.com>
Date: Sat, 2 May 2026 16:21:01 +0800
Subject: [PATCH] [mlir][gpu] Respect explicit async objects when lowering
 gpu.launch_func with async dependencies

When gpu.launch_func carried both async dependencies and an explicit
async object, GPUToLLVMConversion reused the first dependency stream as
the primary launch stream and ignored the explicit async object.

Fix this by keeping the explicit async object as the launch stream and
synchronizing every async dependency onto it. A dependency that was
lowered to a stream is first turned into an event recorded on that
stream, and the launch stream then waits on that event. When no explicit
async object is present, the existing behavior is preserved: the first
dependency is reused as the launch stream.

Add a regression test covering gpu.launch_func lowering with an
explicit async object and an async dependency.

 Changes to be committed:
	modified:   mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
	new file:   mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
---
 .../GPUCommon/GPUToLLVMConversion.cpp         | 26 +++++++++-----
 .../lower-launch-func-async-object.mlir       | 36 +++++++++++++++++++
 2 files changed, 53 insertions(+), 9 deletions(-)
 create mode 100644 mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 21301110cbd42..ad7b2d837072f 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -969,17 +969,25 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
 
   Location loc = launchOp.getLoc();
 
-  Value stream = Value();
+  Value stream = adaptor.getAsyncObject();
+  ValueRange origDeps = launchOp.getAsyncDependencies();
+  ValueRange convertedDeps = adaptor.getAsyncDependencies();
   if (!adaptor.getAsyncDependencies().empty()) {
-    stream = adaptor.getAsyncDependencies().front();
-    // Synchronize additional async dependencies onto the primary stream using
-    // events, following the same approach as gpu.wait async lowering.
-    if (adaptor.getAsyncDependencies().size() > 1) {
+    bool hasExplicitAsyncObject = !!stream;
+    if (!hasExplicitAsyncObject) {
+      stream = convertedDeps.front();
+      origDeps = origDeps.drop_front();
+      convertedDeps = convertedDeps.drop_front();
+    }
+
+    // Synchronize async dependencies onto the primary stream using events,
+    // following the same approach as gpu.wait async lowering. Without an
+    // explicit async object, the first dependency is reused as the primary
+    // stream and therefore skipped here.
+    if (!convertedDeps.empty()) {
       auto insertionPoint = rewriter.saveInsertionPoint();
       SmallVector<Value, 4> events;
-      for (auto [origDep, convertedDep] :
-           llvm::zip(launchOp.getAsyncDependencies().drop_front(),
-                     adaptor.getAsyncDependencies().drop_front())) {
+      for (auto [origDep, convertedDep] : llvm::zip(origDeps, convertedDeps)) {
         if (!isDefinedByCallTo(convertedDep,
                                streamCreateCallBuilder.functionName)) {
           events.push_back(convertedDep);
@@ -1001,7 +1009,7 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   }
   // If the async keyword is present and there are no dependencies, then a
   // stream must be created to pass to subsequent operations.
-  else if (launchOp.getAsyncToken())
+  else if (launchOp.getAsyncToken() && !stream)
     stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
 
   // Lower the kernel operands to match kernel parameters.
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
new file mode 100644
index 0000000000000..03ce6e6a0fe07
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s
+
+// Regression test for gpu.launch_func lowering with an explicit async object.
+//
+// In `gpu-to-llvm`, when `gpu.launch_func` carries both async dependencies and
+// an explicit async object, the explicit async object must remain the launch
+// stream. Async dependencies lowered to streams must instead be converted to
+// events recorded on those streams, and the explicit async object must wait on
+// those events before launching the kernel.
+//
+// The bug was that `gpu.launch_func` lowering always reused the first async
+// dependency stream as the primary launch stream. As a result, an explicit
+// async object was ignored, and the launch happened on the wrong stream.
+module attributes {gpu.container_module} {
+  gpu.module @kernel_module [#nvvm.target] {
+    llvm.func @kernel() attributes {gpu.kernel} {
+      llvm.return
+    }
+  }
+
+  // CHECK-LABEL: @launch_with_async_object
+  func.func @launch_with_async_object(%stream : !llvm.ptr) {
+    %c1 = arith.constant 1 : index
+    // CHECK: %[[dep:.*]] = llvm.call @mgpuStreamCreate()
+    %dep = gpu.wait async
+    // CHECK: %[[event:.*]] = llvm.call @mgpuEventCreate()
+    // CHECK: llvm.call @mgpuEventRecord(%[[event]], %[[dep]])
+    // CHECK: llvm.call @mgpuStreamWaitEvent(%arg0, %[[event]])
+    // CHECK: gpu.launch_func <%arg0 : !llvm.ptr> @kernel_module::@kernel
+    %t = gpu.launch_func async [%dep] <%stream : !llvm.ptr> @kernel_module::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+    gpu.wait [%t]
+    return
+  }
+}