[Openmp-commits] [openmp] cc8dc5e - [OpenMP][AMDGPU] Switch host-device memory copy to asynchronous version

Jon Chesterfield via Openmp-commits openmp-commits at lists.llvm.org
Wed Dec 8 15:02:53 PST 2021


Author: Carlo Bertolli
Date: 2021-12-08T23:02:39Z
New Revision: cc8dc5e28be84099ef4802157984a3522c503e02

URL: https://github.com/llvm/llvm-project/commit/cc8dc5e28be84099ef4802157984a3522c503e02
DIFF: https://github.com/llvm/llvm-project/commit/cc8dc5e28be84099ef4802157984a3522c503e02.diff

LOG: [OpenMP][AMDGPU] Switch host-device memory copy to asynchronous version

Prepare amdgpu plugin for asynchronous implementation. This patch switches to using HSA API for asynchronous memory copy.
Moving away from hsa_memory_copy means that plugin is responsible for locking/unlocking host memory pointers.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D115279

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
    openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
    openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
    openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
index 466e3ee19feb7..cb3931eeba80e 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
@@ -47,6 +47,8 @@ DLWRAP(hsa_amd_memory_pool_free, 1);
 DLWRAP(hsa_amd_memory_async_copy, 8);
 DLWRAP(hsa_amd_memory_pool_get_info, 3);
 DLWRAP(hsa_amd_agents_allow_access, 4);
+DLWRAP(hsa_amd_memory_lock, 5);
+DLWRAP(hsa_amd_memory_unlock, 1);
 DLWRAP(hsa_amd_memory_fill, 3);
 DLWRAP(hsa_amd_register_system_event_handler, 2);
 

diff  --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 41219fd15dc47..a8662ee7b26e7 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -76,6 +76,12 @@ hsa_status_t hsa_amd_agents_allow_access(uint32_t num_agents,
                                          const uint32_t *flags,
                                          const void *ptr);
 
+hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
+                                hsa_agent_t* agents, int num_agent,
+                                void** agent_ptr);
+
+hsa_status_t hsa_amd_memory_unlock(void* host_ptr);
+
 hsa_status_t hsa_amd_memory_fill(void *ptr, uint32_t value, size_t count);
 
 typedef enum hsa_amd_event_type_s {

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
index 7f841aaa8f0d2..d41ca9713623d 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
@@ -5,9 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-#include "hsa_api.h"
-#include "impl_runtime.h"
-#include "internal.h"
 #include "rt.h"
 #include <memory>
 
@@ -15,29 +12,27 @@
  * Data
  */
 
-static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
-                                    const void *src, size_t size,
-                                    hsa_agent_t agent) {
+// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
+static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
+                                    hsa_agent_t agent, const void *src,
+                                    size_t size) {
   const hsa_signal_value_t init = 1;
   const hsa_signal_value_t success = 0;
-  hsa_signal_store_screlease(sig, init);
+  hsa_signal_store_screlease(signal, init);
 
-  hsa_status_t err =
-      hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
-  if (err != HSA_STATUS_SUCCESS) {
+  hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
+                                               nullptr, signal);
+  if (err != HSA_STATUS_SUCCESS)
     return err;
-  }
 
   // async_copy reports success by decrementing and failure by setting to < 0
   hsa_signal_value_t got = init;
-  while (got == init) {
-    got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
+  while (got == init)
+    got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
                                     UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-  }
 
-  if (got != success) {
+  if (got != success)
     return HSA_STATUS_ERROR;
-  }
 
   return err;
 }
@@ -48,19 +43,58 @@ struct implFreePtrDeletor {
   }
 };
 
+enum CopyDirection { H2D, D2H };
+
+static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
+                                         hsa_signal_t signal, void *dest,
+                                         hsa_agent_t agent, void *src,
+                                         void *lockingPtr, size_t size) {
+  hsa_status_t err;
+
+  void *lockedPtr = nullptr;
+  err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
+  if (err != HSA_STATUS_SUCCESS)
+    return err;
+
+  switch (direction) {
+  case H2D:
+    err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
+    break;
+  case D2H:
+    err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
+    break;
+  default:
+    err = HSA_STATUS_ERROR; // fall into unlock before returning
+  }
+
+  if (err != HSA_STATUS_SUCCESS) {
+    // do not leak locked host pointers, but discard potential error message
+    hsa_amd_memory_unlock(lockingPtr);
+    return err;
+  }
+
+  err = hsa_amd_memory_unlock(lockingPtr);
+  if (err != HSA_STATUS_SUCCESS)
+    return err;
+
+  return HSA_STATUS_SUCCESS;
+}
+
 hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
-                             const void *hostSrc, size_t size,
-                             hsa_agent_t agent,
+                             void *hostSrc, size_t size,
+                             hsa_agent_t device_agent,
                              hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size);
+  hsa_status_t err;
 
-  // hsa_memory_copy sometimes fails in situations where
+  err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
+                             device_agent, hostSrc, hostSrc, size);
+
+  if (err == HSA_STATUS_SUCCESS)
+    return err;
+
+  // async memcpy sometimes fails in situations where
   // allocate + copy succeeds. Looks like it might be related to
   // locking part of a read only segment. Fall back for now.
-  if (rc == HSA_STATUS_SUCCESS) {
-    return HSA_STATUS_SUCCESS;
-  }
-
   void *tempHostPtr;
   hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
   if (ret != HSA_STATUS_SUCCESS) {
@@ -70,26 +104,26 @@ hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
   std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
   memcpy(tempHostPtr, hostSrc, size);
 
-  if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) !=
-      HSA_STATUS_SUCCESS) {
-    return HSA_STATUS_ERROR;
-  }
-  return HSA_STATUS_SUCCESS;
+  return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
+                              device_agent, tempHostPtr, tempHostPtr, size);
 }
 
-hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest,
-                             const void *deviceSrc, size_t size,
-                             hsa_agent_t agent,
+hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
+                             void *deviceSrc, size_t size,
+                             hsa_agent_t deviceAgent,
                              hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size);
+  hsa_status_t err;
+
+  // device has always visibility over both pointers, so use that
+  err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
+                             deviceSrc, hostDest, size);
+
+  if (err == HSA_STATUS_SUCCESS)
+    return err;
 
   // hsa_memory_copy sometimes fails in situations where
   // allocate + copy succeeds. Looks like it might be related to
   // locking part of a read only segment. Fall back for now.
-  if (rc == HSA_STATUS_SUCCESS) {
-    return HSA_STATUS_SUCCESS;
-  }
-
   void *tempHostPtr;
   hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
   if (ret != HSA_STATUS_SUCCESS) {
@@ -98,11 +132,11 @@ hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest,
   }
   std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
 
-  if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) !=
-      HSA_STATUS_SUCCESS) {
+  err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
+                             deviceAgent, deviceSrc, tempHostPtr, size);
+  if (err != HSA_STATUS_SUCCESS)
     return HSA_STATUS_ERROR;
-  }
 
-  memcpy(dest, tempHostPtr, size);
+  memcpy(hostDest, tempHostPtr, size);
   return HSA_STATUS_SUCCESS;
 }

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
index 52efaffb515f0..c99b3e752ccec 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
@@ -19,13 +19,12 @@ hsa_status_t impl_module_register_from_memory_to_place(
     void *cb_state);
 
 hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
-                             const void *hostSrc, size_t size,
-                             hsa_agent_t agent,
+                             void *hostSrc, size_t size,
+                             hsa_agent_t device_agent,
                              hsa_amd_memory_pool_t MemoryPool);
 
-hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest,
-                             const void *deviceSrc, size_t size,
-                             hsa_agent_t agent,
+hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc,
+                             size_t size, hsa_agent_t device_agent,
                              hsa_amd_memory_pool_t MemoryPool);
 }
 

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 5434692d0119d..2da012daadc43 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -464,10 +464,9 @@ class RTLDeviceInfoTy {
                 "");
   static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;
 
-  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
-                                      size_t size, hsa_agent_t,
-                                      hsa_amd_memory_pool_t);
-  hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
+  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t size,
+                                      hsa_agent_t, hsa_amd_memory_pool_t);
+  hsa_status_t freesignalpool_memcpy(void *dest, void *src, size_t size,
                                      MemcpyFunc Func, int32_t deviceId) {
     hsa_agent_t agent = HSAAgents[deviceId];
     hsa_signal_t s = FreeSignalPool.pop();
@@ -479,13 +478,13 @@ class RTLDeviceInfoTy {
     return r;
   }
 
-  hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
-                                         size_t size, int32_t deviceId) {
+  hsa_status_t freesignalpool_memcpy_d2h(void *dest, void *src, size_t size,
+                                         int32_t deviceId) {
     return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId);
   }
 
-  hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
-                                         size_t size, int32_t deviceId) {
+  hsa_status_t freesignalpool_memcpy_h2d(void *dest, void *src, size_t size,
+                                         int32_t deviceId) {
     return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId);
   }
 


        


More information about the Openmp-commits mailing list