[Openmp-commits] [openmp] AsyncMalloc (PR #72440)
via Openmp-commits
openmp-commits at lists.llvm.org
Wed Nov 15 13:10:43 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
- WIP Async Malloc for Nvidia
- [Libomptarget] Fix RPC-based `malloc` on NVPTX
---
Full diff: https://github.com/llvm/llvm-project/pull/72440.diff
8 Files Affected:
- (modified) openmp/libomptarget/include/omptarget.h (+3-1)
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+2)
- (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp (+3-4)
- (modified) openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp (+3)
- (modified) openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h (+2)
- (modified) openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp (+25)
- (modified) openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp (+1)
- (modified) openmp/libomptarget/test/libc/malloc.c (+9-1)
``````````diff
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 818967c88904ec0..745f6d928476990 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -116,7 +116,9 @@ enum TargetAllocTy : int32_t {
TARGET_ALLOC_DEVICE = 0,
TARGET_ALLOC_HOST,
TARGET_ALLOC_SHARED,
- TARGET_ALLOC_DEFAULT
+ TARGET_ALLOC_DEFAULT,
+ /// The allocation will not block on other streams.
+ TARGET_ALLOC_DEVICE_NON_BLOCKING,
};
/// This struct contains all of the arguments to a target kernel region launch.
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..d7141726bffa5d6 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2064,6 +2064,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
@@ -3247,6 +3248,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
index 72bba012fcf93c6..27a5a6324ceb429 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -67,15 +67,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to initialize RPC server for device %d: %d", DeviceId, Err);
// Register a custom opcode handler to perform plugin specific allocation.
- // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
auto MallocHandler = [](rpc_port_t Port, void *Data) {
rpc_recv_and_send(
Port,
[](rpc_buffer_t *Buffer, void *Data) {
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Buffer->data[0] = reinterpret_cast<uintptr_t>(
- Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+ Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+ Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
},
Data);
};
@@ -93,7 +92,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
plugin::GenericDeviceTy &Device =
*reinterpret_cast<plugin::GenericDeviceTy *>(Data);
Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE);
+ TARGET_ALLOC_DEVICE_NON_BLOCKING);
},
Data);
};
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 3d0de0d5b2caff6..e968ec712921641 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -42,6 +42,7 @@ DLWRAP(cuLaunchKernel, 11)
DLWRAP(cuMemAlloc, 2)
DLWRAP(cuMemAllocHost, 2)
DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
DLWRAP(cuMemcpyDtoDAsync, 4)
DLWRAP(cuMemcpyDtoH, 3)
@@ -51,6 +52,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)
DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
DLWRAP(cuModuleGetFunction, 3)
DLWRAP(cuModuleGetGlobal, 4)
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e0307759924b21..32031c28f8797ed 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemAllocHost(void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index a6e28574a7f08e3..0ee46cd64a64ebf 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -61,6 +61,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option) {}
#endif
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Forward declarations of asynchronous memory management functions. This is
+// necessary for older versions of CUDA.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; }
+
+CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
+#endif
+
/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
/// Create the CUDA image with the id and the target image pointer.
@@ -486,6 +494,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
MemAlloc = (void *)DevicePtr;
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
+ break;
+ if (Res = cuMemAllocAsync(&DevicePtr, Size, Stream))
+ break;
+ MemAlloc = (void *)DevicePtr;
+ Res = cuStreamDestroy(Stream);
+ }
}
if (auto Err =
@@ -516,6 +533,14 @@ struct CUDADeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_HOST:
Res = cuMemFreeHost(TgtPtr);
break;
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+ CUstream Stream;
+ if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
+ break;
+ cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+ if (Res = cuStreamDestroy(Stream))
+ break;
+ }
}
if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 85cf9bef1543b2a..66937aa3e10bb4d 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -214,6 +214,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemAlloc = std::malloc(Size);
break;
}
diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
index c18a724930f41e5..b587b618472e430 100644
--- a/openmp/libomptarget/test/libc/malloc.c
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@ int main() {
unsigned *d_x;
#pragma omp target map(from : d_x)
{
- d_x = malloc(sizeof(unsigned));
+ d_x = (unsigned *)malloc(sizeof(unsigned));
*d_x = 1;
}
@@ -23,6 +23,14 @@ int main() {
#pragma omp target is_device_ptr(d_x)
{ free(d_x); }
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+ {
+ int *ptr = (int *)malloc(sizeof(int));
+ *ptr = 42;
+ free(ptr);
+ }
+
// CHECK: PASS
if (h_x == 1)
fputs("PASS\n", stdout);
``````````
</details>
https://github.com/llvm/llvm-project/pull/72440
More information about the Openmp-commits
mailing list