[Mlir-commits] [mlir] [MLIR][GPU] Support synchronous gpu.alloc and gpu.dealloc in gpu-to-llvm (PR #191661)

Sun May 17 11:40:59 PDT 2026

https://github.com/jaredhoberock updated https://github.com/llvm/llvm-project/pull/191661

>From 49041ecdacd192162e127d0cbadf7a5adc979bbb Mon Sep 17 00:00:00 2001
From: Jared Hoberock <jaredhoberock at gmail.com>
Date: Sat, 11 Apr 2026 15:04:56 -0500
Subject: [PATCH] [MLIR][GPU] Support synchronous gpu.alloc and gpu.dealloc in
 gpu-to-llvm

The gpu-to-llvm conversion patterns for gpu.alloc and gpu.dealloc
previously required async tokens for non-host-shared operations.
This prevented lowering synchronous device memory allocation and
deallocation to runtime calls.

Remove the async requirement:
- gpu.alloc: drop the isAsyncWithOneDependency guard for non-shared
  allocs. The existing code already handles the sync case correctly
  (null stream when no async dependencies). Fail the match when more
  than one async dependency is present, which preserves the prior
  single-dependency invariant.
- gpu.dealloc: drop the async requirement entirely. Use a null
  stream when no async dependencies are present. Use eraseOp instead
  of replaceOp for sync deallocs (which have no results). Fail the
  match when more than one async dependency is present.
- Add a unit test for the synchronous alloc/dealloc lowering and a
  test that more than one async dependency leaves the op
  unconverted.

Assisted-by: Claude
---
 .../GPUCommon/GPUToLLVMConversion.cpp         | 24 +++++++++---
 .../lower-alloc-to-gpu-runtime-calls.mlir     | 38 ++++++++++++++++++-
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3e99c537d0e02..540c9ce6fbe6c 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -770,8 +770,9 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   if (isShared && allocOp.getAsyncToken())
     return rewriter.notifyMatchFailure(
         allocOp, "Host Shared allocation cannot be done async");
-  if (!isShared && failed(isAsyncWithOneDependency(rewriter, allocOp)))
-    return failure();
+  if (adaptor.getAsyncDependencies().size() > 1)
+    return rewriter.notifyMatchFailure(
+        allocOp, "Can convert with at most one async dependency.");
 
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
@@ -815,18 +816,29 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DeallocOp deallocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-  if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)) ||
-      failed(isAsyncWithOneDependency(rewriter, deallocOp)))
+  if (failed(areAllLLVMTypes(deallocOp, adaptor.getOperands(), rewriter)))
     return failure();
+  if (adaptor.getAsyncDependencies().size() > 1)
+    return rewriter.notifyMatchFailure(
+        deallocOp, "Can convert with at most one async dependency.");
 
   Location loc = deallocOp.getLoc();
 
   Value pointer =
       MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
-  Value stream = adaptor.getAsyncDependencies().front();
+  auto nullPtr = mlir::LLVM::ZeroOp::create(rewriter, loc, llvmPointerType);
+  Value stream = adaptor.getAsyncDependencies().empty()
+                     ? nullPtr
+                     : adaptor.getAsyncDependencies().front();
   deallocCallBuilder.create(loc, rewriter, {pointer, stream});
 
-  rewriter.replaceOp(deallocOp, {stream});
+  if (deallocOp.getAsyncToken()) {
+    // Async dealloc: propagate the stream as the async token replacement.
+    rewriter.replaceOp(deallocOp, {stream});
+  } else {
+    // Sync dealloc: no results to replace, just remove the op.
+    rewriter.eraseOp(deallocOp);
+  }
   return success();
 }
 
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index ae8b7aaac7fd9..5d5c0093edf71 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s
 
 module attributes {gpu.container_module} {
   // CHECK-LABEL: llvm.func @main
@@ -20,6 +20,22 @@ module attributes {gpu.container_module} {
     return
   }
 
+  // CHECK-LABEL: llvm.func @alloc_dealloc_sync
+  // CHECK-SAME: %[[size:.*]]: i64
+  func.func @alloc_dealloc_sync(%size : index) {
+    // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
+    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
+    // CHECK: %[[nullptr:.*]] = llvm.mlir.zero
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[nullptr]], %[[isHostShared]])
+    %0 = gpu.alloc (%size) : memref<?xf32>
+    // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
+    // CHECK: %[[nullptr2:.*]] = llvm.mlir.zero
+    // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[nullptr2]])
+    gpu.dealloc %0 : memref<?xf32>
+    return
+  }
+
   // CHECK-LABEL: llvm.func @alloc_sync
   // CHECK-SAME: %[[size:.*]]: i64
   func.func @alloc_sync(%size : index) {
@@ -38,3 +54,23 @@ module attributes {gpu.container_module} {
     return
   }
 }
+
+// -----
+
+// More than one async dependency is not supported; the alloc and dealloc
+// should be left unconverted.
+module attributes {gpu.container_module} {
+  // CHECK-LABEL: func @multi_dep_unsupported
+  func.func @multi_dep_unsupported(%size : index) {
+    %t1 = gpu.wait async
+    %t2 = gpu.wait async
+    // CHECK: gpu.alloc async [{{.*}}, {{.*}}]
+    // CHECK-NOT: mgpuMemAlloc
+    %buf, %t3 = gpu.alloc async [%t1, %t2] (%size) : memref<?xf32>
+    // CHECK: gpu.dealloc async [{{.*}}, {{.*}}]
+    // CHECK-NOT: mgpuMemFree
+    %t4 = gpu.dealloc async [%t3, %t1] %buf : memref<?xf32>
+    gpu.wait [%t4]
+    return
+  }
+}