[Openmp-commits] [openmp] [Libomptarget] Fix RPC-based malloc on NVPTX (PR #72440)
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Tue Jan 2 14:36:27 PST 2024
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/72440
>From a2093c4a2989d0b962cd1bcf526d066664199e6d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 15 Nov 2023 07:22:41 -0600
Subject: [PATCH] [Libomptarget] Fix RPC-based `malloc` on NVPTX
Summary:
The device allocator on NVPTX architectures is enqueued on a stream that
the kernel is potentially executing on. This can lead to deadlocks as
the kernel will not proceed until the allocation is complete and the
allocation will not proceed until the kernel is complete. CUDA 11.2
introduced async allocations that we can manually place on separate
streams to combat this. This patch makes a new allocation type that's
guaranteed to be non-blocking so it will actually make progress; only
Nvidia needs to care about this as the others are not blocking in this
way by default.
I had originally tried to make the `alloc` and `free` methods take a
`__tgt_async_info`. However, I observed that with the large volume of
streams being created by a parallel test it quickly locked up the system
as presumably too many streams were being created. This implementation
now just creates a new stream and immediately destroys it. This
obviously isn't very fast, but it at least gets the cases to stop
deadlocking for now.
---
openmp/libomptarget/include/omptarget.h | 4 ++-
.../plugins-nextgen/amdgpu/src/rtl.cpp | 2 ++
.../plugins-nextgen/common/src/RPC.cpp | 7 +++--
.../cuda/dynamic_cuda/cuda.cpp | 3 +++
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 2 ++
.../plugins-nextgen/cuda/src/rtl.cpp | 27 +++++++++++++++++++
.../generic-elf-64bit/src/rtl.cpp | 1 +
openmp/libomptarget/test/libc/malloc.c | 10 ++++++-
8 files changed, 50 insertions(+), 6 deletions(-)
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 476a158019d3c5..d5602eec0d07c8 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -103,7 +103,9 @@ enum TargetAllocTy : int32_t {
TARGET_ALLOC_DEVICE = 0,
TARGET_ALLOC_HOST,
TARGET_ALLOC_SHARED,
- TARGET_ALLOC_DEFAULT
+ TARGET_ALLOC_DEFAULT,
+ /// The allocation will not block on other streams.
+ TARGET_ALLOC_DEVICE_NON_BLOCKING,
};
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index fe435a3f558557..0411c670133422 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2112,6 +2112,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
@@ -3315,6 +3316,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
index 60e0540e960228..54aced11b31c3c 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
@@ -62,15 +62,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to initialize RPC server for device %d: %d", DeviceId, Err);
// Register a custom opcode handler to perform plugin specific allocation.
- // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
auto MallocHandler = [](rpc_port_t Port, void *Data) {
rpc_recv_and_send(
Port,
[](rpc_buffer_t *Buffer, void *Data) {
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Buffer->data[0] = reinterpret_cast<uintptr_t>(
- Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+ Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+ Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
},
Data);
};
@@ -88,7 +87,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE);
+ TARGET_ALLOC_DEVICE_NON_BLOCKING);
},
Data);
};
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 56c4404ac2d5c1..5ec3adb9e4e3a1 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -43,6 +43,7 @@ DLWRAP(cuLaunchKernel, 11)
DLWRAP(cuMemAlloc, 2)
DLWRAP(cuMemAllocHost, 2)
DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
DLWRAP(cuMemcpyDtoDAsync, 4)
DLWRAP(cuMemcpyDtoH, 3)
@@ -52,6 +53,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)
DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
DLWRAP(cuModuleGetFunction, 3)
DLWRAP(cuModuleGetGlobal, 4)
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e0307759924b2..32031c28f8797e 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemAllocHost(void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index b0dff917dd0be0..0005bff7a8035d 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -63,6 +63,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option) {}
#endif
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Forward declarations of asynchronous memory management functions. This is
+// necessary for older versions of CUDA.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; }
+
+CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
+#endif
+
/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
/// Create the CUDA image with the id and the target image pointer.
@@ -488,6 +496,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
MemAlloc = (void *)DevicePtr;
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+ break;
+ if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
+ break;
+ cuStreamSynchronize(Stream);
+ Res = cuStreamDestroy(Stream);
+ MemAlloc = (void *)DevicePtr;
+ }
}
if (auto Err =
@@ -518,6 +536,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_HOST:
Res = cuMemFreeHost(TgtPtr);
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
+ break;
+ cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ cuStreamSynchronize(Stream);
+ if ((Res = cuStreamDestroy(Stream)))
+ break;
+ }
}
if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 88b5236d31f482..43569f25055594 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -215,6 +215,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemAlloc = std::malloc(Size);
break;
}
diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
index c18a724930f41e..b587b618472e43 100644
--- a/openmp/libomptarget/test/libc/malloc.c
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@ int main() {
unsigned *d_x;
#pragma omp target map(from : d_x)
{
- d_x = malloc(sizeof(unsigned));
+ d_x = (unsigned *)malloc(sizeof(unsigned));
*d_x = 1;
}
@@ -23,6 +23,14 @@ int main() {
#pragma omp target is_device_ptr(d_x)
{ free(d_x); }
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+ {
+ int *ptr = (int *)malloc(sizeof(int));
+ *ptr = 42;
+ free(ptr);
+ }
+
// CHECK: PASS
if (h_x == 1)
fputs("PASS\n", stdout);
More information about the Openmp-commits
mailing list