[Mlir-commits] [mlir] [MLIR][gpu-to-llvm] Support multiple async dependencies when calling `gpu.launch_func` (PR #165475)
Ian Li
llvmlistbot at llvm.org
Tue Oct 28 13:49:28 PDT 2025
https://github.com/ianayl created https://github.com/llvm/llvm-project/pull/165475
Adds a new pattern to the `gpu-to-llvm` pass that generates an additional `gpu.wait` before `gpu.launch_func` when the `gpu.launch_func` has multiple async dependencies. Without this PR, `gpu.launch_func` ops fail to convert/legalize to GPU runtime calls when multiple async dependencies are provided.
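For illustration, here is a minimal before/after sketch of the rewrite this pattern performs (the kernel name `@kernels::@kernel`, the SSA value names, and `%c1` being an index constant are placeholders chosen for this example, not names from the patch):

```mlir
// Before: gpu.launch_func carries two async dependencies and fails to
// legalize to GPU runtime calls.
%t0 = gpu.wait async
%t1 = gpu.wait async [%t0]
%token = gpu.launch_func async [%t0, %t1] @kernels::@kernel
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)

// After: the pattern folds both dependencies into a single gpu.wait, so the
// rewritten gpu.launch_func carries only one async dependency and can lower
// to the usual stream/event runtime calls.
%t0 = gpu.wait async
%t1 = gpu.wait async [%t0]
%merged = gpu.wait async [%t0, %t1]
%token = gpu.launch_func async [%merged] @kernels::@kernel
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
```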
Notes/Request for comment:
- This PR is written so that other operations whose lowering also accepts only a single async dependency, e.g. `gpu.alloc`, can reuse the pattern and implicitly gain support for multiple async dependencies. The question is: would people like this in MLIR?
- Additionally, should this be an option instead of being enabled by default?
From faaab9e463827fc9469f07fc16b43f08b0df544a Mon Sep 17 00:00:00 2001
From: Ian Li <ian.li at intel.com>
Date: Mon, 27 Oct 2025 15:54:15 -0700
Subject: [PATCH 1/3] initial commit
---
.../mlir/Conversion/GPUCommon/GPUCommonPass.h | 4 +
.../GPUCommon/GPUToLLVMConversion.cpp | 76 +++++++++++++++++++
.../GPUCommon/lower-multiple-async-deps.mlir | 30 ++++++++
3 files changed, 110 insertions(+)
create mode 100644 mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 25b094ebdecb2..a55b46eecee65 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -73,6 +73,10 @@ using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
/// gpu.address_space to integer values.
void populateGpuMemorySpaceAttributeConversions(
TypeConverter &typeConverter, const MemorySpaceMapping &mapping);
+
+/// TODO name this better
+void populateGpuMultipleAsyncDepsConversionPatterns(
+ RewritePatternSet &patterns);
} // namespace mlir
#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 5994b64f3d9a5..fcef7b481a810 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -483,6 +483,14 @@ class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};
+template <typename Op>
+struct ConvertMultipleAsyncDepsToGpuWaitPattern final : OpRewritePattern<Op> {
+ using OpRewritePattern<Op>::OpRewritePattern;
+
+ LogicalResult
+ matchAndRewrite(Op op, PatternRewriter &rewriter) const override;
+};
+
/// Generic rewriting rule for operation on sparse matrices.
/// Currently supports CUDA (by means of cuSparse and cuSparseLt).
#define DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(op_name) \
@@ -538,6 +546,20 @@ void GpuToLLVMConversionPass::runOnOperation() {
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
return signalPassFailure();
}
+ //
+ if (true /*handle multiple async deps enabled*/) {
+ RewritePatternSet patternss(&getContext());
+ populateGpuMultipleAsyncDepsConversionPatterns(patternss);
+ if (failed(applyPatternsGreedily(getOperation(), std::move(patternss)))) {
+ return signalPassFailure();
+ }
+ }
+
+ // TODO wrap this in debug macros
+ llvm::errs() << "--- IR After applyPatternsGreedily ---\n";
+ getOperation()->print(llvm::errs());
+ llvm::errs() << "--------------------------------------\n\n";
+
LowerToLLVMOptions options(context);
options.useBarePtrCallConv = hostBarePtrCallConv;
@@ -1787,6 +1809,27 @@ LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+template<class Op>
+LogicalResult ConvertMultipleAsyncDepsToGpuWaitPattern<Op>::matchAndRewrite(
+ Op op, PatternRewriter &rewriter) const {
+ if (op.getAsyncDependencies().size() <= 1)
+ return rewriter.notifyMatchFailure(
+ op, "Can only convert ops with multiple async dependencies.");
+
+ // Create a new gpu.wait with the original async deps.
+ Type tokenType = rewriter.getType<gpu::AsyncTokenType>();
+ Value waitToken = gpu::WaitOp::create(rewriter, op.getLoc(), tokenType, op.getAsyncDependencies()).getAsyncToken();
+
+ // TODO is it safe to just do getAsyncDependenciesMutable on the original op?
+ Operation *newOp = rewriter.clone(*op.getOperation());
+ auto OpAdaptor = dyn_cast<Op>(newOp);
+ assert(OpAdaptor && "Expected cloned op to have same type as original op.");
+ OpAdaptor.getAsyncDependenciesMutable().assign({waitToken});
+ rewriter.replaceOp(op, newOp);
+
+ return success();
+}
+
void mlir::populateGpuToLLVMConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns,
bool kernelBarePtrCallConv, bool kernelIntersperseSizeCallConv) {
@@ -1830,6 +1873,39 @@ void mlir::populateGpuToLLVMConversionPatterns(
kernelIntersperseSizeCallConv);
}
+void mlir::populateGpuMultipleAsyncDepsConversionPatterns(
+ RewritePatternSet &patterns) {
+ // gpu::AllocOp,
+ // gpu::DeallocOp,
+ // gpu::MemcpyOp,
+ // gpu::MemsetOp,
+ // gpu::CreateDnTensorOp,
+ // gpu::DestroyDnTensorOp,
+ // gpu::CreateCooOp,
+ // gpu::CreateCooAoSOp,
+ // gpu::CreateCsrOp,
+ // gpu::Create2To4SpMatOp,
+ // gpu::DestroySpMatOp,
+ // gpu::SpMVBufferSizeOp,
+ // gpu::SpMVOp,
+ // gpu::SpMMBufferSizeOp,
+ // gpu::SDDMMBufferSizeOp,
+ // gpu::SpMMOp,
+ // gpu::SDDMMOp,
+ // gpu::SpGEMMCreateDescrOp,
+ // gpu::SpGEMMDestroyDescrOp,
+ // gpu::SpGEMMWorkEstimationOrComputeOp,
+ // gpu::SpGEMMCopyOp,
+ // gpu::SpMatGetSizeOp,
+ // gpu::SetCsrPointersOp,
+ // gpu::CreateCscOp,
+ // gpu::CreateBsrOp,
+ // gpu::LaunchFuncOp
+ patterns.add<
+ ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::LaunchFuncOp>
+ >(patterns.getContext());
+}
+
//===----------------------------------------------------------------------===//
// GPUModuleOp convert to LLVM op interface
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir b/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
new file mode 100644
index 0000000000000..799a49280bbed
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+
+module attributes {gpu.container_module} {
+
+ // func.func @foo(%size : index) -> memref<?xf32> {
+ // %t0 = gpu.wait async
+ // %t1 = gpu.wait async [%t0]
+ // %0 = gpu.alloc [%t0, %t1] (%size) : memref<?xf32>
+ // // gpu.wait [%1]
+ // return %0 : memref<?xf32>
+ // }
+
+ gpu.module @foo {
+ gpu.func @bar() kernel {
+ gpu.return
+ }
+ }
+
+ func.func @main() {
+ %c1 = arith.constant 1 : index
+
+ %t0 = gpu.wait async
+ %t1 = gpu.wait async [%t0]
+ %token = gpu.launch_func async [%t0, %t1] @foo::@bar
+ blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
+ gpu.wait [%token]
+ return
+ }
+
+}
From a631506f9c23f979d2a4425f669841435f38d1d6 Mon Sep 17 00:00:00 2001
From: Ian Li <ian.li at intel.com>
Date: Tue, 28 Oct 2025 12:33:26 -0700
Subject: [PATCH 2/3] fix comments
---
.../mlir/Conversion/GPUCommon/GPUCommonPass.h | 4 +-
mlir/include/mlir/Conversion/Passes.td | 1 +
.../GPUCommon/GPUToLLVMConversion.cpp | 74 ++++++++++---------
3 files changed, 42 insertions(+), 37 deletions(-)
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index a55b46eecee65..36f7892f0e564 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -74,7 +74,9 @@ using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
void populateGpuMemorySpaceAttributeConversions(
TypeConverter &typeConverter, const MemorySpaceMapping &mapping);
-/// TODO name this better
+/// Insert gpu.wait ops before GPU operations that have multiple async
+/// dependencies but whose lowering supports only a single async dependency,
+/// e.g. gpu.launch_func.
void populateGpuMultipleAsyncDepsConversionPatterns(
RewritePatternSet &patterns);
} // namespace mlir
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 70e3e45c225db..1fa918c94bcc3 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -552,6 +552,7 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
"runtime with the kernel bare pointer calling convention, to enable "
"dynamic binding of buffers as arguments without static type info."
>
+ // TODO should an option be made to turn this feature on?
];
let dependentDialects = [
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index fcef7b481a810..8cc0b7622e44d 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -546,19 +546,20 @@ void GpuToLLVMConversionPass::runOnOperation() {
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
return signalPassFailure();
}
- //
- if (true /*handle multiple async deps enabled*/) {
+ // Insert gpu.wait operations before operations that do not support multiple
+ // async dependencies.
+ // TODO should this only be enabled upon an option?
+ {
RewritePatternSet patternss(&getContext());
populateGpuMultipleAsyncDepsConversionPatterns(patternss);
if (failed(applyPatternsGreedily(getOperation(), std::move(patternss)))) {
return signalPassFailure();
}
- }
- // TODO wrap this in debug macros
- llvm::errs() << "--- IR After applyPatternsGreedily ---\n";
- getOperation()->print(llvm::errs());
- llvm::errs() << "--------------------------------------\n\n";
+ LLVM_DEBUG(llvm::dbgs() << "--- IR After Adding Additional gpu.waits: ---\n");
+ LLVM_DEBUG(getOperation()->print(llvm::dbgs()));
+ LLVM_DEBUG(llvm::dbgs() << "---------------------------------------------\n");
+ }
LowerToLLVMOptions options(context);
@@ -1822,9 +1823,9 @@ LogicalResult ConvertMultipleAsyncDepsToGpuWaitPattern<Op>::matchAndRewrite(
// TODO is it safe to just do getAsyncDependenciesMutable on the original op?
Operation *newOp = rewriter.clone(*op.getOperation());
- auto OpAdaptor = dyn_cast<Op>(newOp);
- assert(OpAdaptor && "Expected cloned op to have same type as original op.");
- OpAdaptor.getAsyncDependenciesMutable().assign({waitToken});
+ auto iface = dyn_cast<Op>(newOp);
+ assert(iface && "Expected cloned op to have same type as original op.");
+ iface.getAsyncDependenciesMutable().assign({waitToken});
rewriter.replaceOp(op, newOp);
return success();
@@ -1875,32 +1876,33 @@ void mlir::populateGpuToLLVMConversionPatterns(
void mlir::populateGpuMultipleAsyncDepsConversionPatterns(
RewritePatternSet &patterns) {
- // gpu::AllocOp,
- // gpu::DeallocOp,
- // gpu::MemcpyOp,
- // gpu::MemsetOp,
- // gpu::CreateDnTensorOp,
- // gpu::DestroyDnTensorOp,
- // gpu::CreateCooOp,
- // gpu::CreateCooAoSOp,
- // gpu::CreateCsrOp,
- // gpu::Create2To4SpMatOp,
- // gpu::DestroySpMatOp,
- // gpu::SpMVBufferSizeOp,
- // gpu::SpMVOp,
- // gpu::SpMMBufferSizeOp,
- // gpu::SDDMMBufferSizeOp,
- // gpu::SpMMOp,
- // gpu::SDDMMOp,
- // gpu::SpGEMMCreateDescrOp,
- // gpu::SpGEMMDestroyDescrOp,
- // gpu::SpGEMMWorkEstimationOrComputeOp,
- // gpu::SpGEMMCopyOp,
- // gpu::SpMatGetSizeOp,
- // gpu::SetCsrPointersOp,
- // gpu::CreateCscOp,
- // gpu::CreateBsrOp,
- // gpu::LaunchFuncOp
+ // TODO: Other ops to consider handling:
+ // - gpu::AllocOp,
+ // - gpu::DeallocOp,
+ // - gpu::MemcpyOp,
+ // - gpu::MemsetOp,
+ // - gpu::CreateDnTensorOp,
+ // - gpu::DestroyDnTensorOp,
+ // - gpu::CreateCooOp,
+ // - gpu::CreateCooAoSOp,
+ // - gpu::CreateCsrOp,
+ // - gpu::Create2To4SpMatOp,
+ // - gpu::DestroySpMatOp,
+ // - gpu::SpMVBufferSizeOp,
+ // - gpu::SpMVOp,
+ // - gpu::SpMMBufferSizeOp,
+ // - gpu::SDDMMBufferSizeOp,
+ // - gpu::SpMMOp,
+ // - gpu::SDDMMOp,
+ // - gpu::SpGEMMCreateDescrOp,
+ // - gpu::SpGEMMDestroyDescrOp,
+ // - gpu::SpGEMMWorkEstimationOrComputeOp,
+ // - gpu::SpGEMMCopyOp,
+ // - gpu::SpMatGetSizeOp,
+ // - gpu::SetCsrPointersOp,
+ // - gpu::CreateCscOp,
+ // - gpu::CreateBsrOp,
+ // - gpu::LaunchFuncOp
patterns.add<
ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::LaunchFuncOp>
>(patterns.getContext());
From f02c7525b252d080700ce04a96cca776e6ccbccb Mon Sep 17 00:00:00 2001
From: Ian Li <ian.li at intel.com>
Date: Tue, 28 Oct 2025 13:42:14 -0700
Subject: [PATCH 3/3] update test
---
.../GPUCommon/lower-multiple-async-deps.mlir | 71 ++++++++++++++++---
1 file changed, 61 insertions(+), 10 deletions(-)
diff --git a/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir b/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
index 799a49280bbed..768f2aa1e0c6f 100644
--- a/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
@@ -2,29 +2,80 @@
module attributes {gpu.container_module} {
- // func.func @foo(%size : index) -> memref<?xf32> {
- // %t0 = gpu.wait async
- // %t1 = gpu.wait async [%t0]
- // %0 = gpu.alloc [%t0, %t1] (%size) : memref<?xf32>
- // // gpu.wait [%1]
- // return %0 : memref<?xf32>
- // }
-
gpu.module @foo {
gpu.func @bar() kernel {
gpu.return
}
}
+ // CHECK-LABEL: func @main
func.func @main() {
%c1 = arith.constant 1 : index
+ // Check that pass does not modify launch_func ops with only 1 dependency:
+
+ // CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
%t0 = gpu.wait async
+ // CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%t1 = gpu.wait async [%t0]
- %token = gpu.launch_func async [%t0, %t1] @foo::@bar
+ // CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ %0 = gpu.wait async [%t0, %t1]
+ // CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @foo::@bar blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64
+ %good_call = gpu.launch_func async [%0] @foo::@bar
+ blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
+ // CHECK: llvm.call @mgpuStreamSynchronize(%{{.*}}) : (!llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ gpu.wait [%good_call]
+
+ // Check that launch_func ops with multiple dependencies are properly
+ // handled and do not result in a failure:
+
+ // CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ %t2 = gpu.wait async
+ // CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ %t3 = gpu.wait async [%t2]
+ // Inserted gpu.wait:
+ // CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ // gpu.launch_func only has 1 async dependency:
+ // CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @foo::@bar blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64
+ %bad_call = gpu.launch_func async [%t2, %t3] @foo::@bar
blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
- gpu.wait [%token]
+ // CHECK: llvm.call @mgpuStreamSynchronize(%{{.*}}) : (!llvm.ptr) -> ()
+ // CHECK: llvm.call @mgpuStreamDestroy(%{{.*}}) : (!llvm.ptr) -> ()
+ gpu.wait [%bad_call]
return
}
+ // func.func @foo(%size : index) -> memref<?xf32> {
+ // %t0 = gpu.wait async
+ // %t1 = gpu.wait async [%t0]
+ // %0 = gpu.alloc [%t0, %t1] (%size) : memref<?xf32>
+ // // gpu.wait [%1]
+ // return %0 : memref<?xf32>
+ // }
+
}