[Mlir-commits] [mlir] [mlir][gpu] Use alloc OP's `host_shared` in cuda runtime (PR #99035)
Guray Ozen
llvmlistbot at llvm.org
Tue Jul 16 06:21:18 PDT 2024
https://github.com/grypp created https://github.com/llvm/llvm-project/pull/99035
`host_shared` on `gpu.alloc` means the memory will be available on both host and device. This corresponds to managed memory on the NVIDIA side. However, `host_shared` is unused in the runtime. This PR uses it to call cuMemAllocManaged.
>From 232d5edfbab8377f1d2add2491c4f84dd024d556 Mon Sep 17 00:00:00 2001
From: Guray Ozen <gozen at nvidia.com>
Date: Tue, 16 Jul 2024 15:20:09 +0200
Subject: [PATCH] [mlir][gpu] Use alloc OP's `host_shared` in cuda runtime
`host_shared` on `gpu.alloc` means the memory will be available on both host and device. This corresponds to managed memory on the NVIDIA side. However, `host_shared` is unused in the runtime. This PR uses it to call cuMemAllocManaged.
---
.../ExecutionEngine/CudaRuntimeWrappers.cpp | 13 ++++++---
.../GPU/CUDA/alloc-host-shared.mlir | 27 +++++++++++++++++++
2 files changed, 37 insertions(+), 3 deletions(-)
create mode 100644 mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 09dc30365e37c..6a32309aa9e05 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -237,11 +237,18 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
}
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
-mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, bool /*isHostShared*/) {
+mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) {
ScopedContext scopedContext;
CUdeviceptr ptr = 0;
- if (sizeBytes != 0)
- CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
+ if (sizeBytes == 0)
+ return reinterpret_cast<void *>(ptr);
+
+ if (isHostShared) {
+ CUDA_REPORT_IF_ERROR(
+ cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL));
+ return reinterpret_cast<void *>(ptr);
+ }
+ CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
return reinterpret_cast<void *>(ptr);
}
diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
new file mode 100644
index 0000000000000..77fa0deffdd69
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_cuda_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// CHECK: 2000
+module attributes {gpu.container_module} {
+ func.func @main() {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c1000_i32 = arith.constant 1000 : i32
+ %memref = gpu.alloc host_shared () : memref<1xi32>
+    memref.store %c1000_i32, %memref[%c0] : memref<1xi32>
+    gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
+      %1 = memref.load %memref[%c0] : memref<1xi32>
+      %2 = arith.addi %1, %1 : i32
+      memref.store %2, %memref[%c0] : memref<1xi32>
+      gpu.terminator
+    }
+    %0 = memref.load %memref[%c0] : memref<1xi32>
+ vector.print %0 : i32
+ return
+ }
+}
More information about the Mlir-commits
mailing list