[Mlir-commits] [mlir] [MLIR][GPU] Support synchronous gpu.alloc and gpu.dealloc in gpu-to-llvm (PR #191661)
Jared Hoberock
llvmlistbot at llvm.org
Sun May 17 11:40:59 PDT 2026
https://github.com/jaredhoberock updated https://github.com/llvm/llvm-project/pull/191661
>From 49041ecdacd192162e127d0cbadf7a5adc979bbb Mon Sep 17 00:00:00 2001
From: Jared Hoberock <jaredhoberock at gmail.com>
Date: Sat, 11 Apr 2026 15:04:56 -0500
Subject: [PATCH] [MLIR][GPU] Support synchronous gpu.alloc and gpu.dealloc in
gpu-to-llvm
The gpu-to-llvm conversion patterns for gpu.alloc and gpu.dealloc
previously required async tokens for non-host-shared operations.
This prevented lowering synchronous device memory allocation and
deallocation to runtime calls.
Remove the async requirement:
- gpu.alloc: drop the isAsyncWithOneDependency guard for non-shared
allocs. The existing code already handles the sync case correctly
(null stream when no async dependencies). Fail to match when the op
has more than one async dependency, preserving the prior
single-dependency invariant.
- gpu.dealloc: drop the async requirement entirely. Use a null
stream when no async dependencies are present. Use eraseOp instead
of replaceOp for sync deallocs (which have no results). Fail to
match when the op has more than one async dependency.
- Add a lit test for the synchronous alloc/dealloc lowering and a
test verifying that ops with more than one async dependency are
left unconverted.
Assisted-by: Claude
---
.../GPUCommon/GPUToLLVMConversion.cpp | 24 +++++++++---
.../lower-alloc-to-gpu-runtime-calls.mlir | 38 ++++++++++++++++++-
2 files changed, 55 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3e99c537d0e02..540c9ce6fbe6c 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -770,8 +770,9 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
if (isShared && allocOp.getAsyncToken())
return rewriter.notifyMatchFailure(
allocOp, "Host Shared allocation cannot be done async");
- if (!isShared && failed(isAsyncWithOneDependency(rewriter, allocOp)))
- return failure();
+ if (adaptor.getAsyncDependencies().size() > 1)
+ return rewriter.notifyMatchFailure(
+ allocOp, "Can convert with at most one async dependency.");
// Get shape of the memref as values: static sizes are constant
// values and dynamic sizes are passed to 'alloc' as operands.
@@ -815,18 +816,29 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::DeallocOp deallocOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
- if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
- failed(isAsyncWithOneDependency(rewriter, deallocOp)))
+ if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)))
return failure();
+ if (adaptor.getAsyncDependencies().size() > 1)
+ return rewriter.notifyMatchFailure(
+ deallocOp, "Can convert with at most one async dependency.");
Location loc = deallocOp.getLoc();
Value pointer =
MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
- Value stream = adaptor.getAsyncDependencies().front();
+ auto nullPtr = mlir::LLVM::ZeroOp::create(rewriter, loc, llvmPointerType);
+ Value stream = adaptor.getAsyncDependencies().empty()
+ ? nullPtr
+ : adaptor.getAsyncDependencies().front();
deallocCallBuilder.create(loc, rewriter, {pointer, stream});
- rewriter.replaceOp(deallocOp, {stream});
+ if (deallocOp.getAsyncToken()) {
+ // Async dealloc: propagate the stream as the async token replacement.
+ rewriter.replaceOp(deallocOp, {stream});
+ } else {
+ // Sync dealloc: no results to replace, just remove the op.
+ rewriter.eraseOp(deallocOp);
+ }
return success();
}
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index ae8b7aaac7fd9..5d5c0093edf71 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s
module attributes {gpu.container_module} {
// CHECK-LABEL: llvm.func @main
@@ -20,6 +20,22 @@ module attributes {gpu.container_module} {
return
}
+ // CHECK-LABEL: llvm.func @alloc_dealloc_sync
+ // CHECK-SAME: %[[size:.*]]: i64
+ func.func @alloc_dealloc_sync(%size : index) {
+ // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
+ // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
+ // CHECK: %[[nullptr:.*]] = llvm.mlir.zero
+ // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[nullptr]], %[[isHostShared]])
+ %0 = gpu.alloc (%size) : memref<?xf32>
+ // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
+ // CHECK: %[[nullptr2:.*]] = llvm.mlir.zero
+ // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[nullptr2]])
+ gpu.dealloc %0 : memref<?xf32>
+ return
+ }
+
// CHECK-LABEL: llvm.func @alloc_sync
// CHECK-SAME: %[[size:.*]]: i64
func.func @alloc_sync(%size : index) {
@@ -38,3 +54,23 @@ module attributes {gpu.container_module} {
return
}
}
+
+// -----
+
+// More than one async dependency is not supported; the alloc and dealloc
+// should be left unconverted.
+module attributes {gpu.container_module} {
+ // CHECK-LABEL: func @multi_dep_unsupported
+ func.func @multi_dep_unsupported(%size : index) {
+ %t1 = gpu.wait async
+ %t2 = gpu.wait async
+ // CHECK: gpu.alloc async [{{.*}}, {{.*}}]
+ // CHECK-NOT: mgpuMemAlloc
+ %buf, %t3 = gpu.alloc async [%t1, %t2] (%size) : memref<?xf32>
+ // CHECK: gpu.dealloc async [{{.*}}, {{.*}}]
+ // CHECK-NOT: mgpuMemFree
+ %t4 = gpu.dealloc async [%t3, %t1] %buf : memref<?xf32>
+ gpu.wait [%t4]
+ return
+ }
+}
More information about the Mlir-commits
mailing list