[Mlir-commits] [mlir] 8f7c8a6 - Add gpu::HostUnregisterOp
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Apr 6 13:13:35 PDT 2023
Author: max
Date: 2023-04-06T15:07:12-05:00
New Revision: 8f7c8a6ea765139225878e1dfe90bc1eb6f0067c
URL: https://github.com/llvm/llvm-project/commit/8f7c8a6ea765139225878e1dfe90bc1eb6f0067c
DIFF: https://github.com/llvm/llvm-project/commit/8f7c8a6ea765139225878e1dfe90bc1eb6f0067c.diff
LOG: Add gpu::HostUnregisterOp
Without explicitly unregistering the host buffer, you will get
```
'cuMemHostRegister(ptr, sizeBytes, 0)' failed with 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'
```
in CUDA (for example) after repeated runs (e.g., while benchmarking the same kernel).
Reviewed By: ftynse
Differential Revision: https://reviews.llvm.org/D147277
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 32ab246c74f05..860e20720afd9 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -929,6 +929,19 @@ def GPU_HostRegisterOp : GPU_Op<"host_register">,
let assemblyFormat = "$value attr-dict `:` type($value)";
}
+def GPU_HostUnregisterOp : GPU_Op<"host_unregister">,
+ Arguments<(ins AnyUnrankedMemRef:$value)> {
+ let summary = "Unregisters a memref for access from device.";
+ let description = [{
+ This op unmaps the provided host buffer from the device address space.
+
+ This operation may not be supported in every environment, there is not yet a
+ way to check at runtime whether this feature is supported.
+ }];
+ // Textual form: the operand, an optional attribute dict, then `:` and its type.
+ let assemblyFormat = "$value attr-dict `:` type($value)";
+}
+
def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
let summary = "Wait for async gpu ops to complete.";
let description = [{
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 55a5e46839558..3687bd6718bf1 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -161,6 +161,12 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
{llvmIntPtrType /* intptr_t rank */,
llvmPointerType /* void *memrefDesc */,
llvmIntPtrType /* intptr_t elementSizeBytes */}};
+ FunctionCallBuilder hostUnregisterCallBuilder = {
+ "mgpuMemHostUnregisterMemRef", // symbol exported by the runtime wrappers
+ llvmVoidType,
+ {llvmIntPtrType /* intptr_t rank */,
+ llvmPointerType /* void *memrefDesc */,
+ llvmIntPtrType /* intptr_t elementSizeBytes */}};
FunctionCallBuilder allocCallBuilder = {
"mgpuMemAlloc",
llvmPointerType /* void * */,
@@ -202,6 +208,20 @@ class ConvertHostRegisterOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};
+class ConvertHostUnregisterOpToGpuRuntimeCallPattern
+ : public ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp> {
+public:
+ ConvertHostUnregisterOpToGpuRuntimeCallPattern(
+ LLVMTypeConverter &typeConverter)
+ : ConvertOpToGpuRuntimeCallPattern<gpu::HostUnregisterOp>(typeConverter) {
+ }
+ // Replaces the op with a call built by hostUnregisterCallBuilder.
+private:
+ LogicalResult
+ matchAndRewrite(gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+
/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
@@ -446,6 +466,28 @@ LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
+LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
+ gpu::HostUnregisterOp hostUnregisterOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ Operation *op = hostUnregisterOp.getOperation();
+ if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)))
+ return failure();
+
+ Location loc = op->getLoc();
+
+ auto memRefType = hostUnregisterOp.getValue().getType();
+ auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
+ auto elementSize = getSizeInBytes(loc, elementType, rewriter);
+ // Callee takes (rank, memrefDesc, elementSizeBytes); the promoted operands
+ auto arguments = getTypeConverter()->promoteOperands(
+ loc, op->getOperands(), adaptor.getOperands(), rewriter);
+ arguments.push_back(elementSize); // presumably supply the first two - TODO confirm.
+ hostUnregisterCallBuilder.create(loc, rewriter, arguments);
+ // Nothing to replace the op with, so erase it once the call is emitted.
+ rewriter.eraseOp(op);
+ return success();
+}
+
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::AllocOp allocOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
@@ -928,6 +970,7 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,
+ ConvertHostUnregisterOpToGpuRuntimeCallPattern,
ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertMemsetOpToGpuRuntimeCallPattern,
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 44ed5b0cd2057..4065c65316698 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -192,6 +192,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
mgpuMemHostRegister(ptr, sizeBytes);
}
+// Unregisters the host memory at `ptr` from CUDA via cuMemHostUnregister.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemHostUnregister(void *ptr) {
+ ScopedContext scopedContext;
+ CUDA_REPORT_IF_ERROR(cuMemHostUnregister(ptr));
+}
+
+/// Unregisters a memref with the CUDA runtime. `descriptor` points to a ranked
+/// memref descriptor struct of rank `rank`; only its data and offset are read.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+ StridedMemRefType<char, 1> *descriptor,
+ int64_t elementSizeBytes) {
+ auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+ mgpuMemHostUnregister(ptr);
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;
}
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index 43a7e3c620890..bd3868a8e196f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -152,6 +152,22 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
mgpuMemHostRegister(ptr, sizeBytes);
}
+// Unregisters the host memory at `ptr` from the ROCm runtime through
+// hipHostUnregister. Helpful until we have transfer functions implemented.
+extern "C" void mgpuMemHostUnregister(void *ptr) {
+ HIP_REPORT_IF_ERROR(hipHostUnregister(ptr));
+}
+
+// Unregisters a MemRef with the ROCm runtime; only the descriptor's data and
+// offset are read. Helpful until we have transfer functions implemented.
+extern "C" void
+mgpuMemHostUnregisterMemRef(int64_t rank,
+ StridedMemRefType<char, 1> *descriptor,
+ int64_t elementSizeBytes) {
+ auto ptr = descriptor->data + descriptor->offset * elementSizeBytes;
+ mgpuMemHostUnregister(ptr);
+}
+
template <typename T>
void mgpuMemGetDevicePointer(T *hostPtr, T **devicePtr) {
HIP_REPORT_IF_ERROR(hipSetDevice(0));
More information about the Mlir-commits
mailing list