[Openmp-commits] [openmp] aa616ef - [libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h

Pushpinder Singh via Openmp-commits openmp-commits at lists.llvm.org
Tue Oct 20 03:29:48 PDT 2020


Author: Pushpinder Singh
Date: 2020-10-20T06:29:32-04:00
New Revision: aa616efbb34e6321c0f24f61e017efdcf398ba04

URL: https://github.com/llvm/llvm-project/commit/aa616efbb34e6321c0f24f61e017efdcf398ba04
DIFF: https://github.com/llvm/llvm-project/commit/aa616efbb34e6321c0f24f61e017efdcf398ba04.diff

LOG: [libomptarget][AMDGPU][NFC] Split atmi_memcpy for h2d and d2h

The calls to atmi_memcpy presently determine the direction of copy (host to
device or device to host) by storing pointers in a map during malloc and
looking up the pointers during memcpy. As each call site already knows the
direction, this stash+lookup can be eliminated.

This NFC will be followed by a functional one that deletes those map lookups.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D89776

Change-Id: I1d9089bc1e56b3a9a30e334735fa07dee1f84990

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
    openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
index 6a6bc8c35c87..0586cd358c25 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
@@ -35,9 +35,28 @@ atmi_status_t atmi_module_register_from_memory_to_place(
  */
 atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
                           size_t size) {
+  hsa_status_t rc = hsa_memory_copy(dest, src, size);
+
+  // hsa_memory_copy sometimes fails in situations where
+  // allocate + copy succeeds. Looks like it might be related to
+  // locking part of a read only segment. Fall back for now.
+  if (rc == HSA_STATUS_SUCCESS) {
+    return ATMI_STATUS_SUCCESS;
+  }
+
   return core::Runtime::Memcpy(sig, dest, src, size);
 }
 
+atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
+                              const void *host_src, size_t size) {
+  return atmi_memcpy(sig, device_dest, host_src, size);
+}
+
+atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
+                              const void *device_src, size_t size) {
+  return atmi_memcpy(sig, host_dest, device_src, size);
+}
+
 atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }
 
 atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
index 8ee78dfca3e4..a935b6ad4b75 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
@@ -155,53 +155,11 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
  */
 atmi_status_t atmi_free(void *ptr);
 
-/**
- * @brief Syncrhonously copy memory from the source to destination memory
- * locations.
- *
- * @detail This function assumes that the source and destination regions are
- * non-overlapping. The runtime determines the memory place of the source and
- * the
- * destination and executes the appropriate optimized data movement methodology.
- *
- * @param[in] dest The destination pointer previously allocated by a system
- * allocator or @p atmi_malloc.
- *
- * @param[in] src The source pointer previously allocated by a system
- * allocator or @p atmi_malloc.
- *
- * @param[in] size The size of the data to be copied in bytes.
- *
- * @retval ::ATMI_STATUS_SUCCESS The function has executed successfully.
- *
- * @retval ::ATMI_STATUS_ERROR The function encountered errors.
- *
- * @retval ::ATMI_STATUS_UNKNOWN The function encountered errors.
- *
- */
-atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
-                          size_t size);
+atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
+                              const void *host_src, size_t size);
 
-static inline atmi_status_t atmi_memcpy_no_signal(void *dest, const void *src,
-                                                  size_t size) {
-  hsa_signal_t sig;
-  hsa_status_t err = hsa_signal_create(0, 0, NULL, &sig);
-  if (err != HSA_STATUS_SUCCESS) {
-    return ATMI_STATUS_ERROR;
-  }
-
-  atmi_status_t r = atmi_memcpy(sig, dest, src, size);
-  hsa_status_t rc = hsa_signal_destroy(sig);
-
-  if (r != ATMI_STATUS_SUCCESS) {
-    return r;
-  }
-  if (rc != HSA_STATUS_SUCCESS) {
-    return ATMI_STATUS_ERROR;
-  }
-
-  return ATMI_STATUS_SUCCESS;
-}
+atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
+                              const void *device_src, size_t size);
 
 /** @} */
 

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index e0509a5f2b32..8fd7b7e34155 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -335,17 +335,29 @@ class RTLDeviceInfoTy {
   static const int Default_WG_Size =
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
 
-  atmi_status_t freesignalpool_memcpy(void *dest, const void *src,
-                                      size_t size) {
+  using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *,
+                                   size_t size);
+  atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
+                                      MemcpyFunc Func) {
     hsa_signal_t s = FreeSignalPool.pop();
     if (s.handle == 0) {
       return ATMI_STATUS_ERROR;
     }
-    atmi_status_t r = atmi_memcpy(s, dest, src, size);
+    atmi_status_t r = Func(s, dest, src, size);
     FreeSignalPool.push(s);
     return r;
   }
 
+  atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
+                                          size_t size) {
+    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h);
+  }
+
+  atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
+                                          size_t size) {
+    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d);
+  }
+
   // Record entry point associated with device
   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
@@ -538,7 +550,7 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
      (long long unsigned)(Elf64_Addr)TgtPtr,
      (long long unsigned)(Elf64_Addr)HstPtr);
 
-  err = DeviceInfo.freesignalpool_memcpy(HstPtr, TgtPtr, (size_t)Size);
+  err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size);
 
   if (err != ATMI_STATUS_SUCCESS) {
     DP("Error when copying data from device to host. Pointers: "
@@ -564,7 +576,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
      (long long unsigned)(Elf64_Addr)HstPtr,
      (long long unsigned)(Elf64_Addr)TgtPtr);
-  err = DeviceInfo.freesignalpool_memcpy(TgtPtr, HstPtr, (size_t)Size);
+  err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size);
   if (err != ATMI_STATUS_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: "
        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -1021,7 +1033,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
     }
 
     // write ptr to device memory so it can be used by later kernels
-    err = DeviceInfo.freesignalpool_memcpy(state_ptr, &ptr, sizeof(void *));
+    err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *));
     if (err != ATMI_STATUS_SUCCESS) {
       fprintf(stderr, "memcpy install of state_ptr failed\n");
       return NULL;
@@ -1090,7 +1102,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
         // If unified memory is present any target link variables
         // can access host addresses directly. There is no longer a
         // need for device copies.
-        err = DeviceInfo.freesignalpool_memcpy(varptr, e->addr, sizeof(void *));
+        err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
+                                                   sizeof(void *));
         if (err != ATMI_STATUS_SUCCESS)
           DP("Error when copying USM\n");
         DP("Copy linked variable host address (" DPxMOD ")"
@@ -1518,8 +1531,8 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
   void *TgtPtr = NULL;
   atmi_status_t err =
       atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
-  err =
-      DeviceInfo.freesignalpool_memcpy(CallStackAddr, &TgtPtr, sizeof(void *));
+  err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr,
+                                             sizeof(void *));
   if (print_kernel_trace > 2)
     fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
             (long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);


        


More information about the Openmp-commits mailing list