[Openmp-commits] [openmp] 55dc123 - [libomptarget][amdgcn] Refactor memcpy to eliminate maps

via Openmp-commits openmp-commits at lists.llvm.org
Wed Oct 21 08:59:53 PDT 2020


Author: JonChesterfield
Date: 2020-10-21T16:59:33+01:00
New Revision: 55dc123555dbb3e7ca717c1ecc0ab6cd934bdee5

URL: https://github.com/llvm/llvm-project/commit/55dc123555dbb3e7ca717c1ecc0ab6cd934bdee5
DIFF: https://github.com/llvm/llvm-project/commit/55dc123555dbb3e7ca717c1ecc0ab6cd934bdee5.diff

LOG: [libomptarget][amdgcn] Refactor memcpy to eliminate maps

[libomptarget][amdgcn] Refactor memcpy to eliminate maps

Builds on D89776 to remove now dead code.

Reviewed By: pdhaliwal

Differential Revision: https://reviews.llvm.org/D89888

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
    openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
    openmp/libomptarget/plugins/amdgpu/impl/data.cpp
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    openmp/libomptarget/plugins/amdgpu/impl/data.h


################################################################################
diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
index 0586cd358c25..285dc2dbe763 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
@@ -3,7 +3,13 @@
  *
  * This file is distributed under the MIT License. See LICENSE.txt for details.
  *===------------------------------------------------------------------------*/
+#include "atmi_runtime.h"
+#include "internal.h"
 #include "rt.h"
+#include <hsa.h>
+#include <hsa_ext_amd.h>
+#include <memory>
+
 /*
  * Initialize/Finalize
  */
@@ -33,9 +39,44 @@ atmi_status_t atmi_module_register_from_memory_to_place(
 /*
  * Data
  */
-atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
-                          size_t size) {
-  hsa_status_t rc = hsa_memory_copy(dest, src, size);
+
+static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
+                                    const void *src, size_t size,
+                                    hsa_agent_t agent) {
+  const hsa_signal_value_t init = 1;
+  const hsa_signal_value_t success = 0;
+  hsa_signal_store_screlease(sig, init);
+
+  hsa_status_t err =
+      hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
+  if (err != HSA_STATUS_SUCCESS) {
+    return err;
+  }
+
+  // async_copy reports success by decrementing and failure by setting to < 0
+  hsa_signal_value_t got = init;
+  while (got == init) {
+    got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
+                                    UINT64_MAX, ATMI_WAIT_STATE);
+  }
+
+  if (got != success) {
+    return HSA_STATUS_ERROR;
+  }
+
+  return err;
+}
+
+struct atmiFreePtrDeletor {
+  void operator()(void *p) {
+    atmi_free(p); // ignore failure to free
+  }
+};
+
+atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
+                              const void *hostSrc, size_t size,
+                              hsa_agent_t agent) {
+  hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size);
 
   // hsa_memory_copy sometimes fails in situations where
   // allocate + copy succeeds. Looks like it might be related to
@@ -44,17 +85,53 @@ atmi_status_t atmi_memcpy(hsa_signal_t sig, void *dest, const void *src,
     return ATMI_STATUS_SUCCESS;
   }
 
-  return core::Runtime::Memcpy(sig, dest, src, size);
-}
+  void *tempHostPtr;
+  atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
+  atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU);
+  if (ret != ATMI_STATUS_SUCCESS) {
+    DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n",
+                size);
+    return ret;
+  }
+  std::unique_ptr<void, atmiFreePtrDeletor> del(tempHostPtr);
+  memcpy(tempHostPtr, hostSrc, size);
 
-atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
-                              const void *host_src, size_t size) {
-  return atmi_memcpy(sig, device_dest, host_src, size);
+  if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) !=
+      HSA_STATUS_SUCCESS) {
+    return ATMI_STATUS_ERROR;
+  }
+  return ATMI_STATUS_SUCCESS;
 }
 
-atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
-                              const void *device_src, size_t size) {
-  return atmi_memcpy(sig, host_dest, device_src, size);
+atmi_status_t atmi_memcpy_d2h(hsa_signal_t signal, void *dest,
+                              const void *deviceSrc, size_t size,
+                              hsa_agent_t agent) {
+  hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size);
+
+  // hsa_memory_copy sometimes fails in situations where
+  // allocate + copy succeeds. Looks like it might be related to
+  // locking part of a read only segment. Fall back for now.
+  if (rc == HSA_STATUS_SUCCESS) {
+    return ATMI_STATUS_SUCCESS;
+  }
+
+  void *tempHostPtr;
+  atmi_mem_place_t CPU = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
+  atmi_status_t ret = atmi_malloc(&tempHostPtr, size, CPU);
+  if (ret != ATMI_STATUS_SUCCESS) {
+    DEBUG_PRINT("atmi_malloc: Unable to alloc %d bytes for temp scratch\n",
+                size);
+    return ret;
+  }
+  std::unique_ptr<void, atmiFreePtrDeletor> del(tempHostPtr);
+
+  if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) !=
+      HSA_STATUS_SUCCESS) {
+    return ATMI_STATUS_ERROR;
+  }
+
+  memcpy(dest, tempHostPtr, size);
+  return ATMI_STATUS_SUCCESS;
 }
 
 atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
index a935b6ad4b75..47022f7f5dea 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/atmi_runtime.h
@@ -155,11 +155,13 @@ atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place);
  */
 atmi_status_t atmi_free(void *ptr);
 
-atmi_status_t atmi_memcpy_h2d(hsa_signal_t sig, void *device_dest,
-                              const void *host_src, size_t size);
+atmi_status_t atmi_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
+                              const void *hostSrc, size_t size,
+                              hsa_agent_t agent);
 
-atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *host_dest,
-                              const void *device_src, size_t size);
+atmi_status_t atmi_memcpy_d2h(hsa_signal_t sig, void *hostDest,
+                              const void *deviceSrc, size_t size,
+                              hsa_agent_t agent);
 
 /** @} */
 

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
index 91627bd8d4b3..39546fbae4b3 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
@@ -3,7 +3,6 @@
  *
  * This file is distributed under the MIT License. See LICENSE.txt for details.
  *===------------------------------------------------------------------------*/
-#include "data.h"
 #include "atmi_runtime.h"
 #include "internal.h"
 #include "machine.h"
@@ -21,7 +20,6 @@ using core::TaskImpl;
 extern ATLMachine g_atl_machine;
 
 namespace core {
-ATLPointerTracker g_data_map; // Track all am pointer allocations.
 void allow_access_to_all_gpu_agents(void *ptr);
 
 const char *getPlaceStr(atmi_devtype_t type) {
@@ -35,37 +33,6 @@ const char *getPlaceStr(atmi_devtype_t type) {
   }
 }
 
-std::ostream &operator<<(std::ostream &os, const ATLData *ap) {
-  atmi_mem_place_t place = ap->place();
-  os << " devicePointer:" << ap->ptr() << " sizeBytes:" << ap->size()
-     << " place:(" << getPlaceStr(place.dev_type) << ", " << place.dev_id
-     << ", " << place.mem_id << ")";
-  return os;
-}
-
-void ATLPointerTracker::insert(void *pointer, ATLData *p) {
-  std::lock_guard<std::mutex> l(mutex_);
-
-  DEBUG_PRINT("insert: %p + %zu\n", pointer, p->size());
-  tracker_.insert(std::make_pair(ATLMemoryRange(pointer, p->size()), p));
-}
-
-void ATLPointerTracker::remove(void *pointer) {
-  std::lock_guard<std::mutex> l(mutex_);
-  DEBUG_PRINT("remove: %p\n", pointer);
-  tracker_.erase(ATLMemoryRange(pointer, 1));
-}
-
-ATLData *ATLPointerTracker::find(const void *pointer) {
-  std::lock_guard<std::mutex> l(mutex_);
-  ATLData *ret = NULL;
-  auto iter = tracker_.find(ATLMemoryRange(pointer, 1));
-  DEBUG_PRINT("find: %p\n", pointer);
-  if (iter != tracker_.end()) // found
-    ret = iter->second;
-  return ret;
-}
-
 ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) {
   int dev_id = place.dev_id;
   switch (place.dev_type) {
@@ -76,18 +43,12 @@ ATLProcessor &get_processor_by_mem_place(atmi_mem_place_t place) {
   }
 }
 
-static hsa_agent_t get_mem_agent(atmi_mem_place_t place) {
-  return get_processor_by_mem_place(place).agent();
-}
-
 hsa_amd_memory_pool_t get_memory_pool_by_mem_place(atmi_mem_place_t place) {
   ATLProcessor &proc = get_processor_by_mem_place(place);
   return get_memory_pool(proc, place.mem_id);
 }
 
 void register_allocation(void *ptr, size_t size, atmi_mem_place_t place) {
-  ATLData *data = new ATLData(ptr, size, place);
-  g_data_map.insert(ptr, data);
   if (place.dev_type == ATMI_DEVTYPE_CPU)
     allow_access_to_all_gpu_agents(ptr);
   // TODO(ashwinma): what if one GPU wants to access another GPU?
@@ -112,103 +73,13 @@ atmi_status_t Runtime::Malloc(void **ptr, size_t size, atmi_mem_place_t place) {
 atmi_status_t Runtime::Memfree(void *ptr) {
   atmi_status_t ret = ATMI_STATUS_SUCCESS;
   hsa_status_t err;
-  ATLData *data = g_data_map.find(ptr);
-  if (!data)
-    ErrorCheck(Checking pointer info userData,
-               HSA_STATUS_ERROR_INVALID_ALLOCATION);
-
-  g_data_map.remove(ptr);
-  delete data;
-
   err = hsa_amd_memory_pool_free(ptr);
   ErrorCheck(atmi_free, err);
   DEBUG_PRINT("Freed %p\n", ptr);
 
-  if (err != HSA_STATUS_SUCCESS || !data)
+  if (err != HSA_STATUS_SUCCESS)
     ret = ATMI_STATUS_ERROR;
   return ret;
 }
 
-static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest,
-                                    const void *src, size_t size,
-                                    hsa_agent_t agent) {
-  const hsa_signal_value_t init = 1;
-  const hsa_signal_value_t success = 0;
-  hsa_signal_store_screlease(sig, init);
-
-  hsa_status_t err =
-      hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig);
-  if (err != HSA_STATUS_SUCCESS) {
-    return err;
-  }
-
-  // async_copy reports success by decrementing and failure by setting to < 0
-  hsa_signal_value_t got = init;
-  while (got == init) {
-    got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init,
-                                    UINT64_MAX, ATMI_WAIT_STATE);
-  }
-
-  if (got != success) {
-    return HSA_STATUS_ERROR;
-  }
-
-  return err;
-}
-
-struct atmiFreePtrDeletor {
-  void operator()(void *p) {
-    atmi_free(p); // ignore failure to free
-  }
-};
-
-atmi_status_t Runtime::Memcpy(hsa_signal_t sig, void *dest, const void *src,
-                              size_t size) {
-  ATLData *src_data = g_data_map.find(src);
-  ATLData *dest_data = g_data_map.find(dest);
-  atmi_mem_place_t cpu = ATMI_MEM_PLACE_CPU_MEM(0, 0, 0);
-
-  void *temp_host_ptr;
-  atmi_status_t ret = atmi_malloc(&temp_host_ptr, size, cpu);
-  if (ret != ATMI_STATUS_SUCCESS) {
-    return ret;
-  }
-  std::unique_ptr<void, atmiFreePtrDeletor> del(temp_host_ptr);
-
-  if (src_data && !dest_data) {
-    // Copy from device to scratch to host
-    hsa_agent_t agent = get_mem_agent(src_data->place());
-    DEBUG_PRINT("Memcpy D2H device agent: %lu\n", agent.handle);
-
-    if (invoke_hsa_copy(sig, temp_host_ptr, src, size, agent) !=
-        HSA_STATUS_SUCCESS) {
-      return ATMI_STATUS_ERROR;
-    }
-
-    memcpy(dest, temp_host_ptr, size);
-
-  } else if (!src_data && dest_data) {
-    // Copy from host to scratch to device
-    hsa_agent_t agent = get_mem_agent(dest_data->place());
-    DEBUG_PRINT("Memcpy H2D device agent: %lu\n", agent.handle);
-
-    memcpy(temp_host_ptr, src, size);
-
-    if (invoke_hsa_copy(sig, dest, temp_host_ptr, size, agent) !=
-        HSA_STATUS_SUCCESS) {
-      return ATMI_STATUS_ERROR;
-    }
-
-  } else if (!src_data && !dest_data) {
-    // would be host to host, just call memcpy, or missing metadata
-    DEBUG_PRINT("atmi_memcpy invoked without metadata\n");
-    return ATMI_STATUS_ERROR;
-  } else {
-    DEBUG_PRINT("atmi_memcpy unimplemented device to device copy\n");
-    return ATMI_STATUS_ERROR;
-  }
-
-  return ATMI_STATUS_SUCCESS;
-}
-
 } // namespace core

diff  --git a/openmp/libomptarget/plugins/amdgpu/impl/data.h b/openmp/libomptarget/plugins/amdgpu/impl/data.h
deleted file mode 100644
index fa9e7380bf67..000000000000
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*===--------------------------------------------------------------------------
- *              ATMI (Asynchronous Task and Memory Interface)
- *
- * This file is distributed under the MIT License. See LICENSE.txt for details.
- *===------------------------------------------------------------------------*/
-#ifndef SRC_RUNTIME_INCLUDE_DATA_H_
-#define SRC_RUNTIME_INCLUDE_DATA_H_
-#include "atmi.h"
-#include <hsa.h>
-#include <map>
-#include <mutex>
-#include <stdio.h>
-#include <stdlib.h>
-// we maintain our own mapping of device addr to a user specified data object
-// in order to work around a (possibly historic) bug in ROCr's
-// hsa_amd_pointer_info_set_userdata for variable symbols
-// this is expected to be temporary
-
-namespace core {
-// Internal representation of any data that is created and managed by ATMI.
-// Data can be located on any device memory or host memory.
-class ATLData {
-public:
-  ATLData(void *ptr, size_t size, atmi_mem_place_t place)
-      : ptr_(ptr), size_(size), place_(place) {}
-
-  void *ptr() const { return ptr_; }
-  size_t size() const { return size_; }
-  atmi_mem_place_t place() const { return place_; }
-
-private:
-  void *ptr_;
-  size_t size_;
-  atmi_mem_place_t place_;
-};
-
-//---
-struct ATLMemoryRange {
-  const void *base_pointer;
-  const void *end_pointer;
-  ATLMemoryRange(const void *bp, size_t size_bytes)
-      : base_pointer(bp),
-        end_pointer(reinterpret_cast<const unsigned char *>(bp) + size_bytes -
-                    1) {}
-};
-
-// Functor to compare ranges:
-struct ATLMemoryRangeCompare {
-  // Return true is LHS range is less than RHS - used to order the ranges
-  bool operator()(const ATLMemoryRange &lhs, const ATLMemoryRange &rhs) const {
-    return lhs.end_pointer < rhs.base_pointer;
-  }
-};
-
-//-------------------------------------------------------------------------------------------------
-// This structure tracks information for each pointer.
-// Uses memory-range-based lookups - so pointers that exist anywhere in the
-// range of hostPtr + size
-// will find the associated ATLPointerInfo.
-// The insertions and lookups use a self-balancing binary tree and should
-// support O(logN) lookup speed.
-// The structure is thread-safe - writers obtain a mutex before modifying the
-// tree.  Multiple simulatenous readers are supported.
-class ATLPointerTracker {
-  typedef std::map<ATLMemoryRange, ATLData *, ATLMemoryRangeCompare>
-      MapTrackerType;
-
-public:
-  void insert(void *pointer, ATLData *data);
-  void remove(void *pointer);
-  ATLData *find(const void *pointer);
-
-private:
-  MapTrackerType tracker_;
-  std::mutex mutex_;
-};
-
-extern ATLPointerTracker g_data_map; // Track all am pointer allocations.
-
-enum class Direction { ATMI_H2D, ATMI_D2H, ATMI_D2D, ATMI_H2H };
-
-} // namespace core
-#endif // SRC_RUNTIME_INCLUDE_DATA_H_

diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 8fd7b7e34155..dc3a288903f0 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -335,27 +335,28 @@ class RTLDeviceInfoTy {
   static const int Default_WG_Size =
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
 
-  using MemcpyFunc = atmi_status_t(hsa_signal_t, void *, const void *,
-                                   size_t size);
+  using MemcpyFunc = atmi_status_t (*)(hsa_signal_t, void *, const void *,
+                                       size_t size, hsa_agent_t);
   atmi_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
-                                      MemcpyFunc Func) {
+                                      MemcpyFunc Func, int32_t deviceId) {
+    hsa_agent_t agent = HSAAgents[deviceId];
     hsa_signal_t s = FreeSignalPool.pop();
     if (s.handle == 0) {
       return ATMI_STATUS_ERROR;
     }
-    atmi_status_t r = Func(s, dest, src, size);
+    atmi_status_t r = Func(s, dest, src, size, agent);
     FreeSignalPool.push(s);
     return r;
   }
 
   atmi_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
-                                          size_t size) {
-    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h);
+                                          size_t size, int32_t deviceId) {
+    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId);
   }
 
   atmi_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
-                                          size_t size) {
-    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d);
+                                          size_t size, int32_t deviceId) {
+    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId);
   }
 
   // Record entry point associated with device
@@ -550,7 +551,8 @@ int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
      (long long unsigned)(Elf64_Addr)TgtPtr,
      (long long unsigned)(Elf64_Addr)HstPtr);
 
-  err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size);
+  err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
+                                             DeviceId);
 
   if (err != ATMI_STATUS_SUCCESS) {
     DP("Error when copying data from device to host. Pointers: "
@@ -576,7 +578,8 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
      (long long unsigned)(Elf64_Addr)HstPtr,
      (long long unsigned)(Elf64_Addr)TgtPtr);
-  err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size);
+  err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
+                                             DeviceId);
   if (err != ATMI_STATUS_SUCCESS) {
     DP("Error when copying data from host to device. Pointers: "
        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -1033,7 +1036,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
     }
 
     // write ptr to device memory so it can be used by later kernels
-    err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *));
+    err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr, sizeof(void *),
+                                               device_id);
     if (err != ATMI_STATUS_SUCCESS) {
       fprintf(stderr, "memcpy install of state_ptr failed\n");
       return NULL;
@@ -1103,7 +1107,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
         // can access host addresses directly. There is no longer a
         // need for device copies.
         err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
-                                                   sizeof(void *));
+                                                   sizeof(void *), device_id);
         if (err != ATMI_STATUS_SUCCESS)
           DP("Error when copying USM\n");
         DP("Copy linked variable host address (" DPxMOD ")"
@@ -1532,7 +1536,7 @@ static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
   atmi_status_t err =
       atmi_malloc(&TgtPtr, NestedMemSize, get_gpu_mem_place(device_id));
   err = DeviceInfo.freesignalpool_memcpy_h2d(CallStackAddr, &TgtPtr,
-                                             sizeof(void *));
+                                             sizeof(void *), device_id);
   if (print_kernel_trace > 2)
     fprintf(stderr, "CallSck %lx TgtPtr %lx *TgtPtr %lx \n",
             (long)CallStackAddr, (long)&TgtPtr, (long)TgtPtr);


        


More information about the Openmp-commits mailing list