[Mlir-commits] [mlir] [mlir][gpu] Fix gpu.host_register lowering and runtime support (PR #170085)

Mon Dec 1 00:21:51 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-mlir-execution-engine

Author: Men-cotton (Men-cotton)

<details>
<summary>Changes</summary>


This PR fixes several related issues with `gpu.host_register` and `gpu.host_unregister` to ensure correct behavior with bare pointers and multi-rank memrefs, and updates the definitions of both ops.

## 1. Fix bare pointer lowering
When `gpu-to-llvm` is configured with `use-bare-pointers-for-host=1`, `gpu.host_register` previously lowered to a runtime call passing the raw bare pointer. However, the runtime entry point `mgpuMemHostRegisterMemRef` expects a pointer to a memref descriptor struct. This PR updates the lowering in `GPUToLLVMConversion.cpp` to reconstruct a `MemRefDescriptor` from the bare pointer (using static shape information) before passing it to the runtime. This prevents crashes and ensures the runtime receives the expected descriptor format.

## 2. Fix runtime for multi-rank memrefs
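Updates the CUDA and ROCm runtime wrappers (`CudaRuntimeWrappers.cpp`, `RocmRuntimeWrappers.cpp`) to correctly handle multi-rank memrefs in `mgpuMemHostRegisterMemRef` and `mgpuMemHostUnregisterMemRef`. The wrappers now take the descriptor as a `void *` and read its data pointer, offset, sizes, and strides through `DynamicMemRefType` instead of through a rank-1 `StridedMemRefType<char, 1>`; the register wrapper additionally special-cases rank-0 memrefs. This ensures that host registration works reliably for memrefs with `rank > 1`.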

This part fixes: https://github.com/llvm/llvm-project/issues/56366

## 3. Update `gpu.host_register` signature
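Updates `gpu.host_register` and `gpu.host_unregister` to accept `AnyMemRef` (i.e. ranked memrefs) instead of `AnyUnrankedMemRef`. This removes the requirement to cast ranked memrefs to unranked before registration; conversely, unranked memref operands are no longer accepted (see the `@unranked` test case in `host-register-bare-ptr.mlir`). `SparseGPUCodegen.cpp` is updated to remove the now-unnecessary casts.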

---
Full diff: https://github.com/llvm/llvm-project/pull/170085.diff


8 Files Affected:

- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+2-2) 
- (modified) mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp (+60-14) 
- (modified) mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp (+6-10) 
- (modified) mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp (+21-11) 
- (modified) mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp (+24-18) 
- (added) mlir/test/Conversion/GPUCommon/host-register-bare-ptr-func.mlir (+9) 
- (added) mlir/test/Conversion/GPUCommon/host-register-bare-ptr.mlir (+54) 
- (added) mlir/test/Integration/GPU/CUDA/host-register-ranked-memref.mlir (+18) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index a6c6038e1e224..81e8801aae1f0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1566,7 +1566,7 @@ def GPU_BinaryOp : GPU_Op<"binary", [Symbol]>, Arguments<(ins
 }
 
 def GPU_HostRegisterOp : GPU_Op<"host_register">,
-    Arguments<(ins AnyUnrankedMemRef:$value)> {
+    Arguments<(ins AnyMemRef:$value)> {
   let summary = "Registers a memref for access from device.";
   let description = [{
     This op maps the provided host buffer into the device address space.
@@ -1583,7 +1583,7 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">,
 }
 
 def GPU_HostUnregisterOp : GPU_Op<"host_unregister">,
-    Arguments<(ins AnyUnrankedMemRef:$value)> {
+    Arguments<(ins AnyMemRef:$value)> {
   let summary = "Unregisters a memref for access from device.";
   let description = [{
       This op unmaps the provided host buffer from the device address space.
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 5994b64f3d9a5..c34005893ed8c 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -709,6 +709,50 @@ isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
   return success();
 }
 
+static LogicalResult prepareHostRegisterUnregisterArguments(
+    Operation *op, Value value, Value adaptorValue,
+    const LLVMTypeConverter *typeConverter, ConversionPatternRewriter &rewriter,
+    SmallVectorImpl<Value> &arguments, SmallVectorImpl<Type> &elementTypes) {
+  Location loc = op->getLoc();
+  auto valueType = value.getType();
+
+  if (auto memRefType = dyn_cast<MemRefType>(valueType)) {
+    Type elementType = memRefType.getElementType();
+    elementTypes.push_back(elementType);
+    Type llvmIntPtrType = IntegerType::get(
+        rewriter.getContext(), typeConverter->getPointerBitwidth(0));
+    Value rank = rewriter.create<LLVM::ConstantOp>(
+        loc, llvmIntPtrType,
+        rewriter.getIntegerAttr(llvmIntPtrType, memRefType.getRank()));
+    Value descriptor = adaptorValue;
+    Value descriptorPtr;
+    bool useBarePtrCallConv = typeConverter->getOptions().useBarePtrCallConv;
+
+    if (useBarePtrCallConv) {
+      if (!LLVMTypeConverter::canConvertToBarePtr(memRefType)) {
+        return op->emitError(
+            "cannot lower memref with bare pointer calling convention");
+      }
+
+      if (isa<LLVM::LLVMPointerType>(descriptor.getType()))
+        descriptor = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, *typeConverter, memRefType, descriptor);
+
+      descriptorPtr =
+          typeConverter->promoteOneMemRefDescriptor(loc, descriptor, rewriter);
+    } else {
+      descriptorPtr =
+          typeConverter->promoteOneMemRefDescriptor(loc, descriptor, rewriter);
+    }
+    arguments.push_back(rank);
+    arguments.push_back(descriptorPtr);
+  } else {
+    return rewriter.notifyMatchFailure(op, "expected memref operand");
+  }
+
+  return success();
+}
+
 LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::HostRegisterOp hostRegisterOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -716,14 +760,15 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
     return failure();
 
-  Location loc = op->getLoc();
-
-  auto memRefType = hostRegisterOp.getValue().getType();
-  auto elementType = cast<UnrankedMemRefType>(memRefType).getElementType();
-  auto elementSize = getSizeInBytes(loc, elementType, rewriter);
+  SmallVector<Value> arguments;
+  SmallVector<Type> elementTypes;
+  if (failed(prepareHostRegisterUnregisterArguments(
+          op, hostRegisterOp.getValue(), adaptor.getValue(), getTypeConverter(),
+          rewriter, arguments, elementTypes)))
+    return failure(); // Error already emitted or match failure notified
 
-  auto arguments = getTypeConverter()->promoteOperands(
-      loc, op->getOperands(), adaptor.getOperands(), rewriter);
+  Location loc = op->getLoc();
+  auto elementSize = getSizeInBytes(loc, elementTypes.front(), rewriter);
   arguments.push_back(elementSize);
   hostRegisterCallBuilder.create(loc, rewriter, arguments);
 
@@ -738,14 +783,15 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
     return failure();
 
-  Location loc = op->getLoc();
-
-  auto memRefType = hostUnregisterOp.getValue().getType();
-  auto elementType = cast<UnrankedMemRefType>(memRefType).getElementType();
-  auto elementSize = getSizeInBytes(loc, elementType, rewriter);
+  SmallVector<Value> arguments;
+  SmallVector<Type> elementTypes;
+  if (failed(prepareHostRegisterUnregisterArguments(
+          op, hostUnregisterOp.getValue(), adaptor.getValue(),
+          getTypeConverter(), rewriter, arguments, elementTypes)))
+    return failure(); // Error already emitted or match failure notified
 
-  auto arguments = getTypeConverter()->promoteOperands(
-      loc, op->getOperands(), adaptor.getOperands(), rewriter);
+  Location loc = op->getLoc();
+  auto elementSize = getSizeInBytes(loc, elementTypes.front(), rewriter);
   arguments.push_back(elementSize);
   hostUnregisterCallBuilder.create(loc, rewriter, arguments);
 
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
index 0bd1d34c3504b..4f6ddab0821fb 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -109,21 +109,17 @@ static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
 /// Writes from the host are guaranteed to be visible to device kernels
 /// that are launched afterwards. Writes from the device are guaranteed
 /// to be visible on the host after synchronizing with the device kernel
-/// completion. Needs to cast the buffer to a unranked buffer.
+/// completion.
 static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                    Value mem) {
-  MemRefType memTp = cast<MemRefType>(mem.getType());
-  UnrankedMemRefType resTp =
-      UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0);
-  Value cast = memref::CastOp::create(builder, loc, resTp, mem);
-  gpu::HostRegisterOp::create(builder, loc, cast);
-  return cast;
+  gpu::HostRegisterOp::create(builder, loc, mem);
+  return mem;
 }
 
-/// Unmaps the provided buffer, expecting the casted buffer.
+/// Unmaps the provided buffer.
 static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
-                                    Value cast) {
-  gpu::HostUnregisterOp::create(builder, loc, cast);
+                                    Value mem) {
+  gpu::HostUnregisterOp::create(builder, loc, mem);
 }
 
 /// Generates first wait in an asynchronous chain.
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index f203363e16ea2..b9aa78e22e109 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -284,31 +284,40 @@ mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
   CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
 }
 
-/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a
-/// ranked memref descriptor struct of rank `rank`. Helpful until we have
+/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to an
+/// unranked memref descriptor struct of rank `rank`. Helpful until we have
 /// transfer functions implemented.
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
+mgpuMemHostRegisterMemRef(int64_t rank, void *descriptor,
                           int64_t elementSizeBytes) {
+  ::UnrankedMemRefType<char> unranked{rank, descriptor};
+  DynamicMemRefType<char> memRef(unranked);
+
+  // Rank-0 memref: single element.
+  if (rank == 0) {
+    auto *ptr = memRef.data + memRef.offset * elementSizeBytes;
+    mgpuMemHostRegister(ptr, elementSizeBytes);
+    return;
+  }
+
   // Only densely packed tensors are currently supported.
 #ifdef _WIN32
   int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t));
 #else
   int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
 #endif // _WIN32
-  int64_t *sizes = descriptor->sizes;
+  const int64_t *sizes = memRef.sizes;
   for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
     denseStrides[i] = runningStride;
     runningStride *= sizes[i];
   }
   uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes;
-  int64_t *strides = &sizes[rank];
-  (void)strides;
-  for (unsigned i = 0; i < rank; ++i)
+  const int64_t *strides = memRef.strides;
+  for (int64_t i = 0; i < rank; ++i)
     assert(strides[i] == denseStrides[i] &&
            "Mismatch in computed dense strides");
 
-  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  auto *ptr = memRef.data + memRef.offset * elementSizeBytes;
   mgpuMemHostRegister(ptr, sizeBytes);
 }
 
@@ -321,10 +330,11 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
 /// Unregisters a memref with the CUDA runtime. `descriptor` is a pointer to a
 /// ranked memref descriptor struct of rank `rank`
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuMemHostUnregisterMemRef(int64_t rank,
-                            StridedMemRefType<char, 1> *descriptor,
+mgpuMemHostUnregisterMemRef(int64_t rank, void *descriptor,
                             int64_t elementSizeBytes) {
-  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  ::UnrankedMemRefType<char> unranked{rank, descriptor};
+  DynamicMemRefType<char> memRef(unranked);
+  auto *ptr = memRef.data + memRef.offset * elementSizeBytes;
   mgpuMemHostUnregister(ptr);
 }
 
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index b984149ca6dea..e4267273b20c3 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <cassert>
-#include <numeric>
 
 #include "mlir/ExecutionEngine/CRunnerUtils.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -143,25 +142,32 @@ extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
 
 // Allows to register a MemRef with the ROCm runtime. Helpful until we have
 // transfer functions implemented.
-extern "C" void
-mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
-                          int64_t elementSizeBytes) {
+extern "C" void mgpuMemHostRegisterMemRef(int64_t rank, void *descriptor,
+                                          int64_t elementSizeBytes) {
+  ::UnrankedMemRefType<char> unranked{rank, descriptor};
+  DynamicMemRefType<char> memRef(unranked);
+
+  // Rank-0 memref: single element.
+  if (rank == 0) {
+    auto ptr = memRef.data + memRef.offset * elementSizeBytes;
+    mgpuMemHostRegister(ptr, elementSizeBytes);
+    return;
+  }
 
   llvm::SmallVector<int64_t, 4> denseStrides(rank);
-  llvm::ArrayRef<int64_t> sizes(descriptor->sizes, rank);
-  llvm::ArrayRef<int64_t> strides(sizes.end(), rank);
+  llvm::ArrayRef<int64_t> sizes(memRef.sizes, rank);
+  llvm::ArrayRef<int64_t> strides(memRef.strides, rank);
 
-  std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(),
-                   std::multiplies<int64_t>());
-  auto sizeBytes = denseStrides.front() * elementSizeBytes;
+  for (int64_t i = rank - 1, runningStride = 1; i >= 0; --i) {
+    denseStrides[i] = runningStride;
+    runningStride *= sizes[i];
+  }
+  auto sizeBytes = sizes.front() * denseStrides.front() * elementSizeBytes;
 
   // Only densely packed tensors are currently supported.
-  std::rotate(denseStrides.begin(), denseStrides.begin() + 1,
-              denseStrides.end());
-  denseStrides.back() = 1;
   assert(strides == llvm::ArrayRef(denseStrides));
 
-  auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+  auto ptr = memRef.data + memRef.offset * elementSizeBytes;
   mgpuMemHostRegister(ptr, sizeBytes);
 }
 
@@ -173,11 +179,11 @@ extern "C" void mgpuMemHostUnregister(void *ptr) {
 
 // Allows to unregister a MemRef with the ROCm runtime. Helpful until we have
 // transfer functions implemented.
-extern "C" void
-mgpuMemHostUnregisterMemRef(int64_t rank,
-                            StridedMemRefType<char, 1> *descriptor,
-                            int64_t elementSizeBytes) {
-  auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+extern "C" void mgpuMemHostUnregisterMemRef(int64_t rank, void *descriptor,
+                                            int64_t elementSizeBytes) {
+  ::UnrankedMemRefType<char> unranked{rank, descriptor};
+  DynamicMemRefType<char> memRef(unranked);
+  auto ptr = memRef.data + memRef.offset * elementSizeBytes;
   mgpuMemHostUnregister(ptr);
 }
 
diff --git a/mlir/test/Conversion/GPUCommon/host-register-bare-ptr-func.mlir b/mlir/test/Conversion/GPUCommon/host-register-bare-ptr-func.mlir
new file mode 100644
index 0000000000000..8b5c4c4f55d1e
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/host-register-bare-ptr-func.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s --gpu-to-llvm="use-bare-pointers-for-host=1" -split-input-file -verify-diagnostics
+
+module attributes {gpu.container_module} {
+  func.func @dynamic(%buf : memref<?xf32>) {
+    // expected-error @+1 {{cannot lower memref with bare pointer calling convention}}
+    gpu.host_register %buf : memref<?xf32>
+    return
+  }
+}
diff --git a/mlir/test/Conversion/GPUCommon/host-register-bare-ptr.mlir b/mlir/test/Conversion/GPUCommon/host-register-bare-ptr.mlir
new file mode 100644
index 0000000000000..e8013f86766d8
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/host-register-bare-ptr.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt %s --gpu-to-llvm="use-bare-pointers-for-host=1" -split-input-file -verify-diagnostics | FileCheck %s --check-prefix=BARE
+
+module attributes {gpu.container_module} {
+  func.func @host_register(%arg0: memref<4x6xf16>) {
+    gpu.host_register %arg0 : memref<4x6xf16>
+    gpu.host_unregister %arg0 : memref<4x6xf16>
+    return
+  }
+}
+
+// BARE-LABEL: llvm.func @host_register
+// BARE-SAME: ({{.*}}: !llvm.ptr) {
+// BARE: %[[DESC0:.+]] = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+// BARE: %[[DESC1:.+]] = llvm.insertvalue %arg0, %[[DESC0]][0]
+// BARE: %[[DESC2:.+]] = llvm.insertvalue %arg0, %[[DESC1]][1]
+// BARE: %[[OFF:.+]] = llvm.mlir.constant(0 : {{.*}}) : i64
+// BARE: %[[DESC3:.+]] = llvm.insertvalue %[[OFF]], %[[DESC2]][2]
+// BARE: %[[SIZE0:.+]] = llvm.mlir.constant(4 : {{.*}}) : i64
+// BARE: %[[DESC4:.+]] = llvm.insertvalue %[[SIZE0]], %[[DESC3]][3, 0]
+// BARE: %[[STRIDE0:.+]] = llvm.mlir.constant(6 : {{.*}}) : i64
+// BARE: %[[DESC5:.+]] = llvm.insertvalue %[[STRIDE0]], %[[DESC4]][4, 0]
+// BARE: %[[SIZE1:.+]] = llvm.mlir.constant(6 : {{.*}}) : i64
+// BARE: %[[DESC6:.+]] = llvm.insertvalue %[[SIZE1]], %[[DESC5]][3, 1]
+// BARE: %[[STRIDE1:.+]] = llvm.mlir.constant(1 : {{.*}}) : i64
+// BARE: %[[DESC7:.+]] = llvm.insertvalue %[[STRIDE1]], %[[DESC6]][4, 1]
+// BARE: %[[RANK:.+]] = llvm.mlir.constant(2 : {{.*}}) : i64
+// BARE: %[[ALLOCA:.+]] = llvm.alloca %{{.*}} x !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+// BARE: llvm.store %[[DESC7]], %[[ALLOCA]] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>, !llvm.ptr
+// BARE: %[[NULL:.+]] = llvm.mlir.zero : !llvm.ptr
+// BARE: %[[GEP:.+]] = llvm.getelementptr %[[NULL]][1] : (!llvm.ptr) -> !llvm.ptr, f16
+// BARE: %[[ELTSZ:.+]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// BARE: llvm.call @mgpuMemHostRegisterMemRef(%[[RANK]], %[[ALLOCA]], %[[ELTSZ]])
+// BARE: llvm.call @mgpuMemHostUnregisterMemRef(%{{.*}}, %{{.*}}, %{{.*}})
+
+// -----
+
+module attributes {gpu.container_module} {
+  func.func @dynamic(%n: index) {
+    %buf = memref.alloc(%n) : memref<?xf32>
+    // expected-error @+1 {{cannot lower memref with bare pointer calling convention}}
+    gpu.host_register %buf : memref<?xf32>
+    return
+  }
+}
+
+// -----
+
+module attributes {gpu.container_module} {
+  func.func @unranked(%arg0: memref<*xf32>) {
+    // expected-error @+1 {{custom op 'gpu.host_register' invalid kind of type specified: expected builtin.memref, but found 'memref<*xf32>'}}
+    gpu.host_register %arg0 : memref<*xf32>
+    return
+  }
+}
diff --git a/mlir/test/Integration/GPU/CUDA/host-register-ranked-memref.mlir b/mlir/test/Integration/GPU/CUDA/host-register-ranked-memref.mlir
new file mode 100644
index 0000000000000..3195ffb8b6f2d
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/host-register-ranked-memref.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void
+
+module attributes {gpu.container_module} {
+  func.func @main() {
+    %0 = memref.alloc() : memref<64x64xf32>
+
+    // Call host_register with a rank-2 memref.
+    gpu.host_register %0 : memref<64x64xf32>
+
+    memref.dealloc %0 : memref<64x64xf32>
+    return
+  }
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/170085