[Openmp-commits] [openmp] [OpenMP][libomptarget][RFC] extend libomptarget with mechanism to execute fill memory on the target (PR #73801)

Michael Klemm via Openmp-commits openmp-commits at lists.llvm.org
Wed Nov 29 07:00:26 PST 2023


https://github.com/mjklemm created https://github.com/llvm/llvm-project/pull/73801

This PR extends libomptarget with a new plugin entry point to use a plugin's backend implementation for memory full operations (e.g., `hsa_amd_fill_memory` or `cuMemsetD32`) to implement `omp_target_memset()` and `omp_target_memset_async()`.

*This PR is not yet ready to merge, but should serve a place for having a discussion how to deal with this also for future cases.*

>From bc426e3d0da671b25cbdcdc3bb7f923ce5531a19 Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm at amd.com>
Date: Thu, 2 Nov 2023 13:14:43 +0100
Subject: [PATCH] libomptarget with mechanism to execute fill memory on the
 target

---
 openmp/libomptarget/include/device.h          |  4 ++
 openmp/libomptarget/include/omptargetplugin.h | 11 +++++
 openmp/libomptarget/include/rtl.h             |  5 +++
 .../plugins-nextgen/amdgpu/src/rtl.cpp        |  8 ++++
 .../PluginInterface/PluginInterface.cpp       | 31 ++++++++++++++
 .../common/PluginInterface/PluginInterface.h  |  6 +++
 .../plugins-nextgen/cuda/src/rtl.cpp          | 12 ++++++
 .../generic-elf-64bit/src/rtl.cpp             |  9 +++++
 openmp/libomptarget/src/api.cpp               | 40 ++++++++++++-------
 openmp/libomptarget/src/device.cpp            | 10 +++++
 openmp/libomptarget/src/rtl.cpp               |  4 ++
 11 files changed, 125 insertions(+), 15 deletions(-)

diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 74b59a4ab367c75..e71f0e22d8b7079 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -528,6 +528,10 @@ struct DeviceTy {
   int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                        int64_t Size, AsyncInfoTy &AsyncInfo);
 
+  /// Fill memory on the target device (aka memset)
+  int32_t fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                     AsyncInfoTy &AsyncInfo);
+
   /// Notify the plugin about a new mapping starting at the host address
   /// \p HstPtr and \p Size bytes.
   int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
index 8bdb39de9da9ec7..70f6c2fe0c07dce 100644
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -109,6 +109,17 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
                                       int32_t DesID, void *DstPtr, int64_t Size,
                                       __tgt_async_info *AsyncInfo);
 
+// Perform a memory fill operation on the target device (aka "memset") by
+// calling a native driver operation.  In case of success, return zero.
+// Otherwise, return an error code.
+int32_t __tgt_rtl_fill_memory(int32_t DevID, void *Ptr, int32_t ByteVal,
+                              int64_t NumBytes);
+
+// Asynchronous version of __tgt_rtl_fill_memory
+int32_t __tgt_rtl_fill_memory_async(int32_t DevID, void *Ptr, int32_t Val,
+                                    int64_t NumValues,
+                                    __tgt_async_info *AsyncInfo);
+
 // De-allocate the data referenced by target ptr on the device. In case of
 // success, return zero. Otherwise, return an error code. Kind dictates what
 // allocator to use (e.g. shared, host, device).
diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index 49a62685dcdbfa7..1f9dfd8061390db 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -48,6 +48,9 @@ struct RTLInfoTy {
   typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
   typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
                                           int64_t, __tgt_async_info *);
+  typedef int32_t(fill_memory_ty)(int32_t, void *, int32_t, uint64_t);
+  typedef int32_t(fill_memory_async_ty)(int32_t, void *, int32_t, uint64_t,
+                                        __tgt_async_info *);
   typedef int32_t(data_delete_ty)(int32_t, void *, int32_t);
   typedef int32_t(launch_kernel_ty)(int32_t, void *, void **, ptrdiff_t *,
                                     const KernelArgsTy *, __tgt_async_info *);
@@ -101,6 +104,8 @@ struct RTLInfoTy {
   data_retrieve_async_ty *data_retrieve_async = nullptr;
   data_exchange_ty *data_exchange = nullptr;
   data_exchange_async_ty *data_exchange_async = nullptr;
+  fill_memory_ty *fill_memory = nullptr;
+  fill_memory_async_ty *fill_memory_async = nullptr;
   data_delete_ty *data_delete = nullptr;
   launch_kernel_ty *launch_kernel = nullptr;
   init_requires_ty *init_requires = nullptr;
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 4b8ac2f5f9ff517..641e14fc4648095 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2357,6 +2357,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           getAgent(), (uint64_t)Size);
   }
 
+  /// Fill memory on the target device (aka memset)
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapperTy) override {
+    hsa_status_t Status =
+        hsa_amd_memory_fill(const_cast<void *>(Ptr), Val, NumValues);
+    return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     // TODO: Implement this function.
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 3b0b7de86a926ec..973d2334e1639ff 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -1453,6 +1453,14 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                                  __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Err = fillMemoryImpl(Ptr, Val, NumValues, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
@@ -1895,6 +1903,29 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
   return OFFLOAD_SUCCESS;
 }
 
+int32_t __tgt_rtl_fill_memory(int32_t DevId, void *Ptr, int32_t ByteVal,
+                              int64_t NumBytes) {
+  return __tgt_rtl_fill_memory_async(DevId, Ptr, ByteVal, NumBytes,
+                                     /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_fill_memory_async(int32_t DevId, void *Ptr, int32_t Val,
+                                    int64_t NumValues,
+                                    __tgt_async_info *AsyncInfo) {
+  printf("--> in function %s\n", __FUNCTION__);
+  printf("--> Dev: %d, Ptr: %p, Val: %d, NumValues: %ld\n", DevId, Ptr, Val,
+         NumValues);
+  GenericDeviceTy &Device = Plugin::get().getDevice(DevId);
+  auto Err = Device.fillMemory(Ptr, Val, NumValues, AsyncInfo);
+  if (Err) {
+    REPORT("Failure to fill memory on device (%d) at pointer " DPxMOD
+           " with byte value %d and %" PRId64 " values: %s\n",
+           DevId, DPxPTR(Ptr), Val, NumValues, toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
                                 void **TgtArgs, ptrdiff_t *TgtOffsets,
                                 KernelArgsTy *KernelArgs,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 6abd1b6829ab554..bff6c4e8caf26a2 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -775,6 +775,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Fill memory on the target device (aka memset).
+  Error fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                   __tgt_async_info *AsyncInfo);
+  virtual Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValue,
+                               AsyncInfoWrapperTy &AsyncInfo) = 0;
+
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 97e49addc5608cb..2148739588e8242 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -746,6 +746,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  // Fill memory on the target device (aka memset)
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapperTy) override;
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     if (auto Err = setContext())
@@ -1387,6 +1391,14 @@ Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
   return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
 }
 
+/// Fill memory on the target device (aka memset)
+Error CUDADeviceTy::fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapperTy) {
+  CUdeviceptr DevPtr = reinterpret_cast<CUdeviceptr>(Ptr);
+  CUresult Res = cuMemsetD32(DevPtr, Val, static_cast<size_t>(NumValues));
+  return Plugin::check(Res, "Error in cuMemsetD32: %s");
+}
+
 GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }
 
 GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index c0107c1f14f76fb..39864c54530b9a4 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -12,6 +12,7 @@
 
 #include <cassert>
 #include <cstddef>
+#include <cstring>
 #include <ffi.h>
 #include <string>
 #include <unordered_map>
@@ -267,6 +268,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::error("dataExchangeImpl not supported");
   }
 
+  /// Fill memory on the target device (aka memset).
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapperTy) override {
+    (void)std::memset(Ptr, Val,
+                      static_cast<size_t>(NumValues) * sizeof(int32_t));
+    return Plugin::success();
+  }
+
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index ecef02c8a0d3d07..194f64e218b173f 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -320,21 +320,31 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
     // That will require the ability to execute a kernel from within
     // libomptarget.so (which we do not have at the moment).
 
-    // This is a very slow path: create a filled array on the host and upload
-    // it to the GPU device.
-    int InitialDevice = omp_get_initial_device();
-    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
-    if (Shadow) {
-      (void)memset(Shadow, ByteVal, NumBytes);
-      (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
-                              InitialDevice);
-      (void)omp_target_free(Shadow, InitialDevice);
+    if (NumBytes % sizeof(int32_t) == 0) {
+      DeviceTy &Dev = *PM->Devices[DeviceNum];
+      AsyncInfoTy AsyncInfo(Dev);
+      int32_t Val =
+          ByteVal + (ByteVal << 8) + (ByteVal << 16) + (ByteVal << 24);
+      uint64_t NumValues = NumBytes / sizeof(int32_t);
+      int Rc = Dev.fillMemory(Ptr, Val, NumValues, AsyncInfo);
+      printf("--> Rc=%d\n", Rc);
     } else {
-      // If the omp_target_alloc has failed, let's just not do anything.
-      // omp_target_memset does not have any good way to fail, so we
-      // simply avoid a catastrophic failure of the process for now.
-      DP("omp_target_memset failed to fill memory due to error with "
-         "omp_target_alloc");
+      // This is a very slow path: create a filled array on the host and upload
+      // it to the GPU device.
+      int InitialDevice = omp_get_initial_device();
+      void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+      if (Shadow) {
+        (void)memset(Shadow, ByteVal, NumBytes);
+        (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+                                InitialDevice);
+        (void)omp_target_free(Shadow, InitialDevice);
+      } else {
+        // If the omp_target_alloc has failed, let's just not do anything.
+        // omp_target_memset does not have any good way to fail, so we
+        // simply avoid a catastrophic failure of the process for now.
+        DP("omp_target_memset failed to fill memory due to error with "
+           "omp_target_alloc");
+      }
     }
   }
 
@@ -462,7 +472,7 @@ EXTERN int omp_target_memcpy_rect_async(
      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
      "volume " DPxMOD ", element size %zu, num_dims %d\n",
      DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
-     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
+     DPxPTR(SrcOffsets), DPxPTR(DstDimensimons), DPxPTR(SrcDimensions),
      DPxPTR(Volume), ElementSize, NumDims);
 
   // Need to check this first to not return OFFLOAD_FAIL instead
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 87ee48082521712..e2a26900494a1dd 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -672,6 +672,16 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                   DstPtr, Size, AsyncInfo);
 }
 
+// Run a "fill memory" operation (aka "memset") on the target device
+int32_t DeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                             AsyncInfoTy &AsyncInfo) {
+  if (!AsyncInfo || !RTL->fill_memory_async || !RTL->synchronize) {
+    assert(RTL->fill_memory && "RTL->fill_memory is nullptr");
+    return RTL->fill_memory(RTLDeviceID, Ptr, Val, NumValues);
+  }
+  return RTL->fill_memory_async(RTLDeviceID, Ptr, Val, NumValues, AsyncInfo);
+}
+
 int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
   if (!RTL->data_notify_mapped)
     return OFFLOAD_SUCCESS;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 86509cd69c5614f..2a3ae09a9abe584 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -209,6 +209,10 @@ bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
       DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange");
   *((void **)&RTL.data_exchange_async) =
       DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async");
+  *((void **)&RTL.fill_memory) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory");
+  *((void **)&RTL.fill_memory_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory_async");
   *((void **)&RTL.is_data_exchangable) =
       DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable");
   *((void **)&RTL.supports_empty_images) =



More information about the Openmp-commits mailing list