[Mlir-commits] [mlir] [mlir][gpu] Preserve explicit async objects in gpu.launch_func lowering (PR #195436)

Tue May 5 00:42:05 PDT 2026

https://github.com/SabYic updated https://github.com/llvm/llvm-project/pull/195436

>From d789899989066234f146623824a0efa7593c76ba Mon Sep 17 00:00:00 2001
From: SabYic <sunwenjia04 at 163.com>
Date: Sat, 2 May 2026 16:21:01 +0800
Subject: [PATCH 1/2] [mlir][gpu] Respect explicit async objects when lowering
 gpu.launch_func with async dependencies

When gpu.launch_func carried both async dependencies and an explicit
async object, GPUToLLVMConversion reused the first dependency stream as
the primary launch stream and ignored the explicit async object.

Fix this by preserving the explicit async object as the launch stream
and synchronizing async dependencies onto it. Dependencies lowered to
streams are converted to events at the launch boundary as needed.
When no explicit async object is present, the existing behavior is
preserved.

Add a regression test covering gpu.launch_func lowering with an
explicit async object and an async dependency.

 Changes to be committed:
	modified:   mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
	new file:   mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
---
 .../GPUCommon/GPUToLLVMConversion.cpp         | 26 +++++++++-----
 .../lower-launch-func-async-object.mlir       | 36 +++++++++++++++++++
 2 files changed, 53 insertions(+), 9 deletions(-)
 create mode 100644 mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 21301110cbd42..ad7b2d837072f 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -969,17 +969,25 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
 
   Location loc = launchOp.getLoc();
 
-  Value stream = Value();
+  Value stream = adaptor.getAsyncObject();
+  ValueRange origDeps = launchOp.getAsyncDependencies();
+  ValueRange convertedDeps = adaptor.getAsyncDependencies();
   if (!adaptor.getAsyncDependencies().empty()) {
-    stream = adaptor.getAsyncDependencies().front();
-    // Synchronize additional async dependencies onto the primary stream using
-    // events, following the same approach as gpu.wait async lowering.
-    if (adaptor.getAsyncDependencies().size() > 1) {
+    bool hasExplicitAsyncObject = !!stream;
+    if (!hasExplicitAsyncObject) {
+      stream = convertedDeps.front();
+      origDeps = origDeps.drop_front();
+      convertedDeps = convertedDeps.drop_front();
+    }
+
+    // Synchronize async dependencies onto the primary stream using events,
+    // following the same approach as gpu.wait async lowering. Without an
+    // explicit async object, the first dependency is reused as the primary
+    // stream and therefore skipped here.
+    if (!convertedDeps.empty()) {
       auto insertionPoint = rewriter.saveInsertionPoint();
       SmallVector<Value, 4> events;
-      for (auto [origDep, convertedDep] :
-           llvm::zip(launchOp.getAsyncDependencies().drop_front(),
-                     adaptor.getAsyncDependencies().drop_front())) {
+      for (auto [origDep, convertedDep] : llvm::zip(origDeps, convertedDeps)) {
         if (!isDefinedByCallTo(convertedDep,
                                streamCreateCallBuilder.functionName)) {
           events.push_back(convertedDep);
@@ -1001,7 +1009,7 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   }
   // If the async keyword is present and there are no dependencies, then a
   // stream must be created to pass to subsequent operations.
-  else if (launchOp.getAsyncToken())
+  else if (launchOp.getAsyncToken() && !stream)
     stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
 
   // Lower the kernel operands to match kernel parameters.
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
new file mode 100644
index 0000000000000..03ce6e6a0fe07
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s
+
+// Regression test for gpu.launch_func lowering with an explicit async object.
+//
+// In `gpu-to-llvm`, when `gpu.launch_func` carries both async dependencies and
+// an explicit async object, the explicit async object must remain the launch
+// stream. Async dependencies lowered to streams must instead be converted to
+// events recorded on those streams, and the explicit async object must wait on
+// those events before launching the kernel.
+//
+// The bug was that `gpu.launch_func` lowering always reused the first async
+// dependency stream as the primary launch stream. As a result, an explicit
+// async object was ignored, and the launch happened on the wrong stream.
+module attributes {gpu.container_module} {
+  gpu.module @kernel_module [#nvvm.target] {
+    llvm.func @kernel() attributes {gpu.kernel} {
+      llvm.return
+    }
+  }
+
+  // CHECK-LABEL: @launch_with_async_object
+  func.func @launch_with_async_object(%stream : !llvm.ptr) {
+    %c1 = arith.constant 1 : index
+    // CHECK: %[[dep:.*]] = llvm.call @mgpuStreamCreate()
+    %dep = gpu.wait async
+    // CHECK: %[[event:.*]] = llvm.call @mgpuEventCreate()
+    // CHECK: llvm.call @mgpuEventRecord(%[[event]], %[[dep]])
+    // CHECK: llvm.call @mgpuStreamWaitEvent(%arg0, %[[event]])
+    // CHECK: gpu.launch_func <%arg0 : !llvm.ptr> @kernel_module::@kernel
+    %t = gpu.launch_func async [%dep] <%stream : !llvm.ptr> @kernel_module::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+    gpu.wait [%t]
+    return
+  }
+}

>From 2aa393bba9f98ad7a0429c81ccbc01cce27afb66 Mon Sep 17 00:00:00 2001
From: SabYic <sunwenjia04 at 163.com>
Date: Tue, 5 May 2026 15:41:34 +0800
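Subject: [PATCH 2/2] [mlir][gpu] Reject gpu.launch_func with both async
 dependencies and an explicit async object

Revert the lowering change from the previous patch and instead make the
LaunchFuncOp verifier reject operations that carry both async
dependencies and an explicit async object. The lowering regression test
is removed and replaced by a negative test for the new verifier error.

	modified:   mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
	modified:   mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
	deleted:    mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
	modified:   mlir/test/Dialect/GPU/invalid.mlir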

---
 .../GPUCommon/GPUToLLVMConversion.cpp         | 28 ++++++---------
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        |  4 +++
 .../lower-launch-func-async-object.mlir       | 36 -------------------
 mlir/test/Dialect/GPU/invalid.mlir            | 18 ++++++++++
 4 files changed, 32 insertions(+), 54 deletions(-)
 delete mode 100644 mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index ad7b2d837072f..271293a6d3496 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -969,25 +969,17 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
 
   Location loc = launchOp.getLoc();
 
-  Value stream = adaptor.getAsyncObject();
-  ValueRange origDeps = launchOp.getAsyncDependencies();
-  ValueRange convertedDeps = adaptor.getAsyncDependencies();
+  Value stream = Value();
   if (!adaptor.getAsyncDependencies().empty()) {
-    bool hasExplicitAsyncObject = !!stream;
-    if (!hasExplicitAsyncObject) {
-      stream = convertedDeps.front();
-      origDeps = origDeps.drop_front();
-      convertedDeps = convertedDeps.drop_front();
-    }
-
-    // Synchronize async dependencies onto the primary stream using events,
-    // following the same approach as gpu.wait async lowering. Without an
-    // explicit async object, the first dependency is reused as the primary
-    // stream and therefore skipped here.
-    if (!convertedDeps.empty()) {
+    stream = adaptor.getAsyncDependencies().front();
+    // Synchronize additional async dependencies onto the primary stream using
+    // events, following the same approach as gpu.wait async lowering.
+    if (adaptor.getAsyncDependencies().size() > 1) {
       auto insertionPoint = rewriter.saveInsertionPoint();
       SmallVector<Value, 4> events;
-      for (auto [origDep, convertedDep] : llvm::zip(origDeps, convertedDeps)) {
+      for (auto [origDep, convertedDep] :
+           llvm::zip(launchOp.getAsyncDependencies().drop_front(),
+                     adaptor.getAsyncDependencies().drop_front())) {
         if (!isDefinedByCallTo(convertedDep,
                                streamCreateCallBuilder.functionName)) {
           events.push_back(convertedDep);
@@ -1009,7 +1001,7 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
   }
   // If the async keyword is present and there are no dependencies, then a
   // stream must be created to pass to subsequent operations.
-  else if (launchOp.getAsyncToken() && !stream)
+  else if (launchOp.getAsyncToken())
     stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
 
   // Lower the kernel operands to match kernel parameters.
@@ -1917,4 +1909,4 @@ void mlir::gpu::registerConvertGpuToLLVMInterface(DialectRegistry &registry) {
   registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
     gpu::GPUModuleOp::attachInterface<GPUModuleOpConvertToLLVMInterface>(*ctx);
   });
-}
+}
\ No newline at end of file
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index d3fb6df2010d2..f776129c77405 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1341,6 +1341,10 @@ LogicalResult LaunchFuncOp::verify() {
              << "expects types of the cluster dimensions must be the same";
   }
 
+  if (!getAsyncDependencies().empty() && getAsyncObject())
+    return emitOpError(
+        "cannot have both async dependencies and an explicit async object");
+
   return success();
 }
 
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
deleted file mode 100644
index 03ce6e6a0fe07..0000000000000
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-async-object.mlir
+++ /dev/null
@@ -1,36 +0,0 @@
-// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s
-
-// Regression test for gpu.launch_func lowering with an explicit async object.
-//
-// In `gpu-to-llvm`, when `gpu.launch_func` carries both async dependencies and
-// an explicit async object, the explicit async object must remain the launch
-// stream. Async dependencies lowered to streams must instead be converted to
-// events recorded on those streams, and the explicit async object must wait on
-// those events before launching the kernel.
-//
-// The bug was that `gpu.launch_func` lowering always reused the first async
-// dependency stream as the primary launch stream. As a result, an explicit
-// async object was ignored, and the launch happened on the wrong stream.
-module attributes {gpu.container_module} {
-  gpu.module @kernel_module [#nvvm.target] {
-    llvm.func @kernel() attributes {gpu.kernel} {
-      llvm.return
-    }
-  }
-
-  // CHECK-LABEL: @launch_with_async_object
-  func.func @launch_with_async_object(%stream : !llvm.ptr) {
-    %c1 = arith.constant 1 : index
-    // CHECK: %[[dep:.*]] = llvm.call @mgpuStreamCreate()
-    %dep = gpu.wait async
-    // CHECK: %[[event:.*]] = llvm.call @mgpuEventCreate()
-    // CHECK: llvm.call @mgpuEventRecord(%[[event]], %[[dep]])
-    // CHECK: llvm.call @mgpuStreamWaitEvent(%arg0, %[[event]])
-    // CHECK: gpu.launch_func <%arg0 : !llvm.ptr> @kernel_module::@kernel
-    %t = gpu.launch_func async [%dep] <%stream : !llvm.ptr> @kernel_module::@kernel
-        blocks in (%c1, %c1, %c1)
-        threads in (%c1, %c1, %c1)
-    gpu.wait [%t]
-    return
-  }
-}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index bf862b2c5ae3c..76c22431a2293 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -222,6 +222,24 @@ module attributes {gpu.container_module} {
 
 // -----
 
+module attributes {gpu.container_module} {
+  gpu.module @kernels {
+    gpu.func @kernel_1() kernel {
+      gpu.return
+    }
+  }
+
+  func.func @launch_func_async_deps_and_async_object(%sz : index, %stream : !llvm.ptr) {
+    %dep = gpu.wait async
+    // expected-error@+1 {{cannot have both async dependencies and an explicit async object}}
+    %t = gpu.launch_func async [%dep] <%stream : !llvm.ptr> @kernels::@kernel_1
+        blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz)
+    return
+  }
+}
+
+// -----
+
 module attributes {gpu.container_module} {
   gpu.module @kernels {
     gpu.func @kernel_1(%arg1 : !llvm.ptr) {