[Openmp-commits] [openmp] [OpenMP][libomptarget][RFC] extend libomptarget with mechanism to execute fill memory on the target (PR #73801)
Michael Klemm via Openmp-commits
openmp-commits at lists.llvm.org
Wed Nov 29 07:00:26 PST 2023
https://github.com/mjklemm created https://github.com/llvm/llvm-project/pull/73801
This PR extends libomptarget with a new plugin entry point that uses a plugin's backend implementation of memory fill operations (e.g., `hsa_amd_memory_fill` or `cuMemsetD32`) to implement `omp_target_memset()` and `omp_target_memset_async()`.
*This PR is not yet ready to merge, but it should serve as a place to discuss how to deal with this, also for future cases.*
>From bc426e3d0da671b25cbdcdc3bb7f923ce5531a19 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Thu, 2 Nov 2023 13:14:43 +0100
Subject: [PATCH] libomptarget with mechanism to execute fill memory on the
target
---
openmp/libomptarget/include/device.h | 4 ++
openmp/libomptarget/include/omptargetplugin.h | 11 +++++
openmp/libomptarget/include/rtl.h | 5 +++
.../plugins-nextgen/amdgpu/src/rtl.cpp | 8 ++++
.../PluginInterface/PluginInterface.cpp | 31 ++++++++++++++
.../common/PluginInterface/PluginInterface.h | 6 +++
.../plugins-nextgen/cuda/src/rtl.cpp | 12 ++++++
.../generic-elf-64bit/src/rtl.cpp | 9 +++++
openmp/libomptarget/src/api.cpp | 40 ++++++++++++-------
openmp/libomptarget/src/device.cpp | 10 +++++
openmp/libomptarget/src/rtl.cpp | 4 ++
11 files changed, 125 insertions(+), 15 deletions(-)
diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 74b59a4ab367c75..e71f0e22d8b7079 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -528,6 +528,10 @@ struct DeviceTy {
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, AsyncInfoTy &AsyncInfo);
+ /// Fill memory on the target device (aka memset)
+ /// \p Ptr is the device address to fill, \p Val the 32-bit value to store and
+ /// \p NumValues the number of 32-bit values (not bytes) to write. The call is
+ /// forwarded to the plugin's synchronous or asynchronous fill entry point and
+ /// that entry point's status code is returned.
+ int32_t fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+ AsyncInfoTy &AsyncInfo);
+
/// Notify the plugin about a new mapping starting at the host address
/// \p HstPtr and \p Size bytes.
int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
index 8bdb39de9da9ec7..70f6c2fe0c07dce 100644
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -109,6 +109,17 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
int32_t DesID, void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
+// Perform a memory fill operation on the target device (aka "memset") by
+// calling a native driver operation. The memory starting at Ptr is filled with
+// NumValues copies of the 32-bit value Val, i.e. NumValues * 4 bytes are
+// written. In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_fill_memory(int32_t DevID, void *Ptr, int32_t Val,
+                              int64_t NumValues);
+
+// Asynchronous version of __tgt_rtl_fill_memory. Val and NumValues have the
+// same meaning as for the synchronous entry point.
+int32_t __tgt_rtl_fill_memory_async(int32_t DevID, void *Ptr, int32_t Val,
+                                    int64_t NumValues,
+                                    __tgt_async_info *AsyncInfo);
+
// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code. Kind dictates what
// allocator to use (e.g. shared, host, device).
diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index 49a62685dcdbfa7..1f9dfd8061390db 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -48,6 +48,9 @@ struct RTLInfoTy {
typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
int64_t, __tgt_async_info *);
+  // Signatures of the plugin fill entry points. The count parameter is
+  // int64_t so that these types match __tgt_rtl_fill_memory and
+  // __tgt_rtl_fill_memory_async, the functions the pointers are bound to.
+  typedef int32_t(fill_memory_ty)(int32_t, void *, int32_t, int64_t);
+  typedef int32_t(fill_memory_async_ty)(int32_t, void *, int32_t, int64_t,
+                                        __tgt_async_info *);
typedef int32_t(data_delete_ty)(int32_t, void *, int32_t);
typedef int32_t(launch_kernel_ty)(int32_t, void *, void **, ptrdiff_t *,
const KernelArgsTy *, __tgt_async_info *);
@@ -101,6 +104,8 @@ struct RTLInfoTy {
data_retrieve_async_ty *data_retrieve_async = nullptr;
data_exchange_ty *data_exchange = nullptr;
data_exchange_async_ty *data_exchange_async = nullptr;
+ // Plugin entry points for the native fill ("memset") operation, synchronous
+ // and asynchronous. Both are nullptr by default.
+ fill_memory_ty *fill_memory = nullptr;
+ fill_memory_async_ty *fill_memory_async = nullptr;
data_delete_ty *data_delete = nullptr;
launch_kernel_ty *launch_kernel = nullptr;
init_requires_ty *init_requires = nullptr;
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 4b8ac2f5f9ff517..641e14fc4648095 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2357,6 +2357,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
getAgent(), (uint64_t)Size);
}
+  /// Fill memory on the target device (aka memset): store \p NumValues copies
+  /// of the 32-bit value \p Val starting at \p Ptr. The HSA fill is issued
+  /// directly from this call, so \p AsyncInfoWrapper is not used.
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    // Ptr is already a pointer to non-const, so it is passed as is. The value
+    // and count are converted explicitly to the unsigned types HSA expects.
+    hsa_status_t Status = hsa_amd_memory_fill(
+        Ptr, static_cast<uint32_t>(Val), static_cast<size_t>(NumValues));
+    return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 3b0b7de86a926ec..973d2334e1639ff 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -1453,6 +1453,14 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
return Err;
}
+/// Fill \p NumValues values of \p Val starting at \p Ptr by delegating to the
+/// device-specific fillMemoryImpl, then finalize the async info wrapper with
+/// the resulting error state and hand that error back to the caller.
+Error GenericDeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                                  __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy Wrapper(*this, AsyncInfo);
+  Error Result = fillMemoryImpl(Ptr, Val, NumValues, Wrapper);
+  Wrapper.finalize(Result);
+  return Result;
+}
+
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs,
@@ -1895,6 +1903,29 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
return OFFLOAD_SUCCESS;
}
+/// Synchronous fill entry point: forwards to __tgt_rtl_fill_memory_async
+/// without an async info object. \p Val is the 32-bit fill value and
+/// \p NumValues the number of such values, exactly as for the asynchronous
+/// entry point. Returns that function's status code.
+int32_t __tgt_rtl_fill_memory(int32_t DevId, void *Ptr, int32_t Val,
+                              int64_t NumValues) {
+  return __tgt_rtl_fill_memory_async(DevId, Ptr, Val, NumValues,
+                                     /* AsyncInfoPtr */ nullptr);
+}
+
+/// Fill \p NumValues copies of the 32-bit value \p Val starting at \p Ptr on
+/// device \p DevId, optionally attached to \p AsyncInfo. Returns
+/// OFFLOAD_SUCCESS on success; returns OFFLOAD_FAIL for a negative count or,
+/// after reporting the plugin error, when the device fill fails.
+int32_t __tgt_rtl_fill_memory_async(int32_t DevId, void *Ptr, int32_t Val,
+                                    int64_t NumValues,
+                                    __tgt_async_info *AsyncInfo) {
+  // The device interface takes an unsigned count; reject negative values
+  // instead of letting them wrap around to an enormous fill size.
+  if (NumValues < 0)
+    return OFFLOAD_FAIL;
+
+  GenericDeviceTy &Device = Plugin::get().getDevice(DevId);
+  auto Err = Device.fillMemory(Ptr, Val, NumValues, AsyncInfo);
+  if (Err) {
+    REPORT("Failure to fill memory on device (%d) at pointer " DPxMOD
+           " with value %d and %" PRId64 " values: %s\n",
+           DevId, DPxPTR(Ptr), Val, NumValues, toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
void **TgtArgs, ptrdiff_t *TgtOffsets,
KernelArgsTy *KernelArgs,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 6abd1b6829ab554..bff6c4e8caf26a2 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -775,6 +775,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+  /// Fill memory on the target device (aka memset): store \p NumValues copies
+  /// of the 32-bit value \p Val starting at \p Ptr. fillMemory wraps
+  /// \p AsyncInfo and forwards to fillMemoryImpl, which each device type must
+  /// implement.
+  Error fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                   __tgt_async_info *AsyncInfo);
+  virtual Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
/// Run the kernel associated with \p EntryPtr
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 97e49addc5608cb..2148739588e8242 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -746,6 +746,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+  /// Fill memory on the target device (aka memset): store \p NumValues copies
+  /// of the 32-bit value \p Val starting at \p Ptr.
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
if (auto Err = setContext())
@@ -1387,6 +1391,14 @@ Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
}
+/// Fill memory on the target device (aka memset): store \p NumValues copies of
+/// the 32-bit value \p Val starting at \p Ptr. The fill is issued with
+/// cuMemsetD32 rather than on a stream, so \p AsyncInfoWrapper is not used.
+Error CUDADeviceTy::fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  // Make this device's context current before calling into the driver, the
+  // same way initAsyncInfoImpl does.
+  if (auto Err = setContext())
+    return Err;
+
+  CUdeviceptr DevPtr = reinterpret_cast<CUdeviceptr>(Ptr);
+  CUresult Res = cuMemsetD32(DevPtr, static_cast<unsigned int>(Val),
+                             static_cast<size_t>(NumValues));
+  return Plugin::check(Res, "Error in cuMemsetD32: %s");
+}
+
GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }
GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index c0107c1f14f76fb..39864c54530b9a4 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -12,6 +12,7 @@
#include <cassert>
#include <cstddef>
+#include <cstring>
#include <ffi.h>
#include <string>
#include <unordered_map>
@@ -267,6 +268,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::error("dataExchangeImpl not supported");
}
+  /// Fill memory on the target device (aka memset): store \p NumValues copies
+  /// of the 32-bit value \p Val starting at \p Ptr. The fill is done on the
+  /// host right away, so \p AsyncInfoWrapper is not used.
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    const size_t NumBytes = static_cast<size_t>(NumValues) * sizeof(Val);
+    const uint32_t Pattern = static_cast<uint32_t>(Val);
+    const uint32_t LowByte = Pattern & 0xFFu;
+
+    // std::memset only replicates the low byte of its value argument, so it is
+    // exact only when all four bytes of the 32-bit value are equal.
+    if (Pattern == LowByte * 0x01010101u) {
+      (void)std::memset(Ptr, static_cast<int>(LowByte), NumBytes);
+      return Plugin::success();
+    }
+
+    // General case: copy the full 32-bit value into every element. memcpy is
+    // used so that no particular alignment of Ptr has to be assumed.
+    char *Dst = static_cast<char *>(Ptr);
+    for (size_t Offset = 0; Offset < NumBytes; Offset += sizeof(Val))
+      std::memcpy(Dst + Offset, &Val, sizeof(Val));
+    return Plugin::success();
+  }
+
/// All functions are already synchronous. No need to do anything on this
/// synchronization function.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index ecef02c8a0d3d07..194f64e218b173f 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -320,21 +320,31 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
// That will require the ability to execute a kernel from within
// libomptarget.so (which we do not have at the moment).
- // This is a very slow path: create a filled array on the host and upload
- // it to the GPU device.
- int InitialDevice = omp_get_initial_device();
- void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
- if (Shadow) {
- (void)memset(Shadow, ByteVal, NumBytes);
- (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
- InitialDevice);
- (void)omp_target_free(Shadow, InitialDevice);
+ if (NumBytes % sizeof(int32_t) == 0) {
+ DeviceTy &Dev = *PM->Devices[DeviceNum];
+ AsyncInfoTy AsyncInfo(Dev);
+ int32_t Val =
+ ByteVal + (ByteVal << 8) + (ByteVal << 16) + (ByteVal << 24);
+ uint64_t NumValues = NumBytes / sizeof(int32_t);
+ int Rc = Dev.fillMemory(Ptr, Val, NumValues, AsyncInfo);
+ printf("--> Rc=%d\n", Rc);
} else {
- // If the omp_target_alloc has failed, let's just not do anything.
- // omp_target_memset does not have any good way to fail, so we
- // simply avoid a catastrophic failure of the process for now.
- DP("omp_target_memset failed to fill memory due to error with "
- "omp_target_alloc");
+ // This is a very slow path: create a filled array on the host and upload
+ // it to the GPU device.
+ int InitialDevice = omp_get_initial_device();
+ void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+ if (Shadow) {
+ (void)memset(Shadow, ByteVal, NumBytes);
+ (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+ InitialDevice);
+ (void)omp_target_free(Shadow, InitialDevice);
+ } else {
+ // If the omp_target_alloc has failed, let's just not do anything.
+ // omp_target_memset does not have any good way to fail, so we
+ // simply avoid a catastrophic failure of the process for now.
+ DP("omp_target_memset failed to fill memory due to error with "
+ "omp_target_alloc");
+ }
}
}
@@ -462,7 +472,7 @@ EXTERN int omp_target_memcpy_rect_async(
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
"volume " DPxMOD ", element size %zu, num_dims %d\n",
DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
- DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
+ DPxPTR(SrcOffsets), DPxPTR(DstDimensimons), DPxPTR(SrcDimensions),
DPxPTR(Volume), ElementSize, NumDims);
// Need to check this first to not return OFFLOAD_FAIL instead
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 87ee48082521712..e2a26900494a1dd 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -672,6 +672,16 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
DstPtr, Size, AsyncInfo);
}
+// Run a "fill memory" operation (aka "memset") on the target device: store
+// NumValues copies of the 32-bit value Val starting at Ptr. The plugin's
+// asynchronous entry point is used when AsyncInfo, fill_memory_async and
+// synchronize are all available; otherwise the synchronous entry point is
+// used. Returns the plugin's status code, or OFFLOAD_FAIL when the plugin does
+// not provide the synchronous entry point that would be needed.
+int32_t DeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                             AsyncInfoTy &AsyncInfo) {
+  if (!AsyncInfo || !RTL->fill_memory_async || !RTL->synchronize) {
+    // The entry point is looked up by name when the plugin is loaded and is
+    // nullptr by default. Fail gracefully instead of calling through a null
+    // pointer in builds where assert is compiled out.
+    if (!RTL->fill_memory)
+      return OFFLOAD_FAIL;
+    return RTL->fill_memory(RTLDeviceID, Ptr, Val, NumValues);
+  }
+  return RTL->fill_memory_async(RTLDeviceID, Ptr, Val, NumValues, AsyncInfo);
+}
+
int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
if (!RTL->data_notify_mapped)
return OFFLOAD_SUCCESS;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 86509cd69c5614f..2a3ae09a9abe584 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -209,6 +209,10 @@ bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange");
*((void **)&RTL.data_exchange_async) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async");
+ *((void **)&RTL.fill_memory) =
+ DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory");
+ *((void **)&RTL.fill_memory_async) =
+ DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory_async");
*((void **)&RTL.is_data_exchangable) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable");
*((void **)&RTL.supports_empty_images) =
More information about the Openmp-commits
mailing list