[Openmp-commits] [openmp] a15f858 - [libomptarget] Add support for target memory allocators to cuda RTL
via Openmp-commits
openmp-commits at lists.llvm.org
Fri May 7 07:29:38 PDT 2021
Author: Joseph Huber
Date: 2021-05-07T10:27:02-04:00
New Revision: a15f8589f4e81973b096a5ccc7b5b687c3284ebe
URL: https://github.com/llvm/llvm-project/commit/a15f8589f4e81973b096a5ccc7b5b687c3284ebe
DIFF: https://github.com/llvm/llvm-project/commit/a15f8589f4e81973b096a5ccc7b5b687c3284ebe.diff
LOG: [libomptarget] Add support for target memory allocators to cuda RTL
Summary:
The allocator interface added in D97883 allows the RTL to allocate shared and
host-pinned memory through the CUDA plugin. This patch adds support for these
allocation kinds to the runtime.
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D102000
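For readers of this patch, a minimal usage sketch (not part of the commit) of how the two new allocation kinds are exercised from user code. The llvm_omp_target_alloc_shared and llvm_omp_target_alloc_host prototypes are copied from the tests added below; the rest of the program is illustrative:

#include <omp.h>
#include <stddef.h>
#include <stdio.h>

// Prototypes copied from the tests added in this patch.
void *llvm_omp_target_alloc_shared(size_t, int);
void *llvm_omp_target_alloc_host(size_t, int);

int main(void) {
  const int N = 64;
  const int device = omp_get_default_device();

  // TARGET_ALLOC_SHARED: CUDA managed memory, addressable from both host
  // and device (cuMemAllocManaged in the plugin).
  int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device);

  // TARGET_ALLOC_HOST: page-locked host memory (cuMemAllocHost).
  int *host_ptr = llvm_omp_target_alloc_host(N * sizeof(int), device);

#pragma omp target device(device) is_device_ptr(shared_ptr)
  for (int i = 0; i < N; ++i)
    shared_ptr[i] = i;

  // Pinned memory is ordinary host memory as far as the CPU is concerned.
  for (int i = 0; i < N; ++i)
    host_ptr[i] = shared_ptr[i];

  // Both kinds are released through omp_target_free; as the rtl.cpp hunks
  // below show, the plugin records host-pinned pointers so it can route
  // them to cuMemFreeHost instead of cuMemFree.
  omp_target_free(shared_ptr, device);
  omp_target_free(host_ptr, device);
  return 0;
}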
Added:
openmp/libomptarget/test/api/omp_device_managed_memory.c
openmp/libomptarget/test/api/omp_host_pinned_memory.c
Modified:
openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
index 6e00728a658f7..f4b63f301c204 100644
--- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
+++ b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
@@ -31,7 +31,7 @@ class DeviceAllocatorTy {
/// Allocate memory of size \p Size. \p HstPtr is used to assist the
/// allocation.
- virtual void *allocate(size_t Size, void *HstPtr) = 0;
+ virtual void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) = 0;
/// Delete the pointer \p TgtPtr on the device
virtual int free(void *TgtPtr) = 0;
@@ -133,7 +133,7 @@ class MemoryManagerTy {
/// Request memory from target device
void *allocateOnDevice(size_t Size, void *HstPtr) const {
- return DeviceAllocator.allocate(Size, HstPtr);
+ return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
}
/// Deallocate data on device
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 7834f771dd97b..e8fe63736b939 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -17,6 +17,7 @@
#include <memory>
#include <mutex>
#include <string>
+#include <unordered_map>
#include <vector>
#include "Debug.h"
@@ -297,12 +298,13 @@ class DeviceRTLTy {
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
const int DeviceId;
const std::vector<DeviceDataTy> &DeviceData;
+ std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
public:
CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
: DeviceId(DeviceId), DeviceData(DeviceData) {}
- void *allocate(size_t Size, void *) override {
+ void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
if (Size == 0)
return nullptr;
@@ -310,12 +312,34 @@ class DeviceRTLTy {
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return nullptr;
- CUdeviceptr DevicePtr;
- Err = cuMemAlloc(&DevicePtr, Size);
- if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
- return nullptr;
+ void *MemAlloc = nullptr;
+ switch (Kind) {
+ case TARGET_ALLOC_DEFAULT:
+ case TARGET_ALLOC_DEVICE:
+ CUdeviceptr DevicePtr;
+ Err = cuMemAlloc(&DevicePtr, Size);
+ MemAlloc = (void *)DevicePtr;
+ if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
+ return nullptr;
+ break;
+ case TARGET_ALLOC_HOST:
+ void *HostPtr;
+ Err = cuMemAllocHost(&HostPtr, Size);
+ MemAlloc = HostPtr;
+ if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
+ return nullptr;
+ HostPinnedAllocs[MemAlloc] = Kind;
+ break;
+ case TARGET_ALLOC_SHARED:
+ CUdeviceptr SharedPtr;
+ Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
+ MemAlloc = (void *)SharedPtr;
+ if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
+ return nullptr;
+ break;
+ }
- return (void *)DevicePtr;
+ return MemAlloc;
}
int free(void *TgtPtr) override {
@@ -323,9 +347,25 @@ class DeviceRTLTy {
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
return OFFLOAD_FAIL;
- Err = cuMemFree((CUdeviceptr)TgtPtr);
- if (!checkResult(Err, "Error returned from cuMemFree\n"))
- return OFFLOAD_FAIL;
+ // Host pinned memory must be freed differently.
+ TargetAllocTy Kind =
+ (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
+ ? TARGET_ALLOC_DEFAULT
+ : TARGET_ALLOC_HOST;
+ switch (Kind) {
+ case TARGET_ALLOC_DEFAULT:
+ case TARGET_ALLOC_DEVICE:
+ case TARGET_ALLOC_SHARED:
+ Err = cuMemFree((CUdeviceptr)TgtPtr);
+ if (!checkResult(Err, "Error returned from cuMemFree\n"))
+ return OFFLOAD_FAIL;
+ break;
+ case TARGET_ALLOC_HOST:
+ Err = cuMemFreeHost(TgtPtr);
+ if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
+ return OFFLOAD_FAIL;
+ break;
+ }
return OFFLOAD_SUCCESS;
}
@@ -804,11 +844,24 @@ class DeviceRTLTy {
return getOffloadEntriesTable(DeviceId);
}
- void *dataAlloc(const int DeviceId, const int64_t Size) {
- if (UseMemoryManager)
- return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+ void *dataAlloc(const int DeviceId, const int64_t Size,
+ const TargetAllocTy Kind) {
+ switch (Kind) {
+ case TARGET_ALLOC_DEFAULT:
+ case TARGET_ALLOC_DEVICE:
+ if (UseMemoryManager)
+ return MemoryManagers[DeviceId]->allocate(Size, nullptr);
+ else
+ return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+ case TARGET_ALLOC_HOST:
+ case TARGET_ALLOC_SHARED:
+ return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
+ }
- return DeviceAllocators[DeviceId].allocate(Size, nullptr);
+ REPORT("Invalid target data allocation kind or requested allocator not "
+ "implemented yet\n");
+
+ return nullptr;
}
int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
@@ -1097,13 +1150,7 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
int32_t kind) {
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
- if (kind != TARGET_ALLOC_DEFAULT) {
- REPORT("Invalid target data allocation kind or requested allocator not "
- "implemented yet\n");
- return NULL;
- }
-
- return DeviceRTL.dataAlloc(device_id, size);
+ return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
diff --git a/openmp/libomptarget/test/api/omp_device_managed_memory.c b/openmp/libomptarget/test/api/omp_device_managed_memory.c
new file mode 100644
index 0000000000000..e3114d85894f1
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_device_managed_memory.c
@@ -0,0 +1,29 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *llvm_omp_target_alloc_shared(size_t, int);
+
+int main() {
+ const int N = 64;
+ const int device = omp_get_default_device();
+
+ int *shared_ptr = llvm_omp_target_alloc_shared(N * sizeof(int), device);
+
+#pragma omp target teams distribute parallel for device(device) \
+ is_device_ptr(shared_ptr)
+ for (int i = 0; i < N; ++i) {
+ shared_ptr[i] = 1;
+ }
+
+ int sum = 0;
+ for (int i = 0; i < N; ++i)
+ sum += shared_ptr[i];
+
+ omp_target_free(shared_ptr, device);
+ // CHECK: PASS
+ if (sum == N)
+ printf("PASS\n");
+}
diff --git a/openmp/libomptarget/test/api/omp_host_pinned_memory.c b/openmp/libomptarget/test/api/omp_host_pinned_memory.c
new file mode 100644
index 0000000000000..63f311851e85d
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_host_pinned_memory.c
@@ -0,0 +1,33 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+// Allocate pinned memory on the host
+void *llvm_omp_target_alloc_host(size_t, int);
+
+int main() {
+ const int N = 64;
+ const int device = omp_get_default_device();
+ const int host = omp_get_initial_device();
+
+ int *hst_ptr = llvm_omp_target_alloc_host(N * sizeof(int), device);
+
+ for (int i = 0; i < N; ++i)
+ hst_ptr[i] = 2;
+
+#pragma omp target teams distribute parallel for device(device) \
+ map(tofrom:hst_ptr[0 : N])
+ for (int i = 0; i < N; ++i)
+ hst_ptr[i] -= 1;
+
+ int sum = 0;
+ for (int i = 0; i < N; ++i)
+ sum += hst_ptr[i];
+
+ omp_target_free(hst_ptr, device);
+ // CHECK: PASS
+ if (sum == N)
+ printf("PASS\n");
+}
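For context, the Kind values dispatched on in the rtl.cpp hunks above come from the TargetAllocTy enum introduced in D97883. A sketch of its shape follows; the enumerator names match this diff, but the explicit numeric value and the typedef form are assumptions, so check openmp/include/omptarget.h for the authoritative definition:

/* Sketch only; names from the hunks above, values assumed. */
typedef enum {
  TARGET_ALLOC_DEVICE = 0, /* cuMemAlloc: plain device memory */
  TARGET_ALLOC_HOST,       /* cuMemAllocHost: page-locked host memory */
  TARGET_ALLOC_SHARED,     /* cuMemAllocManaged: host/device shared */
  TARGET_ALLOC_DEFAULT     /* plugin's choice; device memory in this RTL */
} TargetAllocTy;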