[Openmp-commits] [openmp] a014fbb - [OpenMP] Improve D2D memcpy to use more efficient driver API

Shilei Tian via Openmp-commits openmp-commits at lists.llvm.org
Thu Jun 4 13:59:12 PDT 2020


Author: Shilei Tian
Date: 2020-06-04T16:59:06-04:00
New Revision: a014fbbc219fc8e1dbce382fd6f9280c3b720219

URL: https://github.com/llvm/llvm-project/commit/a014fbbc219fc8e1dbce382fd6f9280c3b720219
DIFF: https://github.com/llvm/llvm-project/commit/a014fbbc219fc8e1dbce382fd6f9280c3b720219.diff

LOG: [OpenMP] Improve D2D memcpy to use more efficient driver API

Summary:
In current implementation, D2D memcpy is first to copy data back to host and then
copy from host to device. This is very efficient if the device supports D2D
memcpy, like CUDA.

In this patch, D2D memcpy will first try to use native supported driver API. If
it fails, fall back to original way. It is worth noting that D2D memcpy in this
scenerio contains two ideas:
- Same devices: this is the D2D memcpy in the CUDA context.
- Different devices: this is the PeerToPeer memcpy in the CUDA context.
My implementation merges this two parts. It chooses the best API according to
the source device and destination device.

Reviewers: jdoerfert, AndreyChurbanov, grokos

Reviewed By: jdoerfert

Subscribers: yaxunl, guansong, sstefan1, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D80649

Added: 
    openmp/libomptarget/test/offloading/d2d_memcpy.c

Modified: 
    openmp/libomptarget/include/omptargetplugin.h
    openmp/libomptarget/plugins/cuda/src/rtl.cpp
    openmp/libomptarget/plugins/exports
    openmp/libomptarget/src/api.cpp
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/device.h
    openmp/libomptarget/src/rtl.cpp
    openmp/libomptarget/src/rtl.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
index 083e422aac16..6785e77edbb4 100644
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -31,6 +31,11 @@ int32_t __tgt_rtl_number_of_devices(void);
 // having to load the library, which can be expensive.
 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);
 
+// Return an integer other than zero if the data can be exchaned from SrcDevId
+// to DstDevId. If it is data exchangable, the device plugin should provide
+// function to move data from source device to destination device directly.
+int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId);
+
 // Initialize the requires flags for the device.
 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags);
 
@@ -77,6 +82,18 @@ int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr,
                                       void *TargetPtr, int64_t Size,
                                       __tgt_async_info *AsyncInfoPtr);
 
+// Copy the data content from one target device to another target device using
+// its address. This operation does not need to copy data back to host and then
+// from host to another device. In case of success, return zero. Otherwise,
+// return an error code.
+int32_t __tgt_rtl_data_exchange(int32_t SrcID, void *SrcPtr, int32_t DstID,
+                                void *DstPtr, int64_t Size);
+
+// Asynchronous version of __tgt_rtl_data_exchange
+int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
+                                      int32_t DesID, void *DstPtr, int64_t Size,
+                                      __tgt_async_info *AsyncInfoPtr);
+
 // De-allocate the data referenced by target ptr on the device. In case of
 // success, return zero. Otherwise, return an error code.
 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);

diff  --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 7e089a17370e..b877a6361824 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -100,6 +100,22 @@ bool checkResult(CUresult Err, const char *ErrMsg) {
   return false;
 }
 
+int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
+               CUstream Stream) {
+  CUresult Err =
+      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
+
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when copying data from device to device. Pointers: src "
+       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
+    CUDA_ERR_STRING(Err);
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 // Structure contains per-device data
 struct DeviceDataTy {
   std::list<FuncOrGblEntryTy> FuncGblEntries;
@@ -736,6 +752,57 @@ class DeviceRTLTy {
     return OFFLOAD_SUCCESS;
   }
 
+  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
+                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
+    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+
+    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return OFFLOAD_FAIL;
+
+    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);
+
+    // If they are two devices, we try peer to peer copy first
+    if (SrcDevId != DstDevId) {
+      int CanAccessPeer = 0;
+      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
+      if (Err != CUDA_SUCCESS) {
+        DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
+           ", dst = %" PRId32 "\n",
+           SrcDevId, DstDevId);
+        CUDA_ERR_STRING(Err);
+        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+      }
+
+      if (!CanAccessPeer) {
+        DP("P2P memcpy not supported so fall back to D2D memcpy");
+        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+      }
+
+      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
+      if (Err != CUDA_SUCCESS) {
+        DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
+           ", dst = %" PRId32 "\n",
+           SrcDevId, DstDevId);
+        CUDA_ERR_STRING(Err);
+        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+      }
+
+      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
+                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
+                              Size, Stream);
+      if (Err == CUDA_SUCCESS)
+        return OFFLOAD_SUCCESS;
+
+      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
+         ", src_id =%" PRId32 ", dst_ptr = %" DPxMOD ", dst_id =%" PRId32 "\n",
+         SrcPtr, SrcDevId, DstPtr, DstDevId);
+      CUDA_ERR_STRING(Err);
+    }
+
+    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+  }
+
   int dataDelete(const int DeviceId, void *TgtPtr) const {
     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
@@ -900,6 +967,14 @@ int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
   return RequiresFlags;
 }
 
+int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
+  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
+      DeviceRTL.isValidDeviceId(dst_dev_id))
+    return 1;
+
+  return 0;
+}
+
 int32_t __tgt_rtl_init_device(int32_t device_id) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 
@@ -965,6 +1040,33 @@ int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                 async_info_ptr);
 }
 
+int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
+                                      int dst_dev_id, void *dst_ptr,
+                                      int64_t size,
+                                      __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
+  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+
+  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
+                                async_info_ptr);
+}
+
+int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
+                                int32_t dst_dev_id, void *dst_ptr,
+                                int64_t size) {
+  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
+  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_data_exchange_async(
+      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(src_dev_id, &async_info);
+}
+
 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
 

diff  --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
index a4e1a3186daa..62bfc6e24d90 100644
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -1,6 +1,7 @@
 VERS1.0 {
   global:
     __tgt_rtl_is_valid_binary;
+    __tgt_rtl_is_data_exchangable;
     __tgt_rtl_number_of_devices;
     __tgt_rtl_init_requires;
     __tgt_rtl_init_device;
@@ -10,6 +11,8 @@ VERS1.0 {
     __tgt_rtl_data_submit_async;
     __tgt_rtl_data_retrieve;
     __tgt_rtl_data_retrieve_async;
+    __tgt_rtl_data_exchange;
+    __tgt_rtl_data_exchange_async;
     __tgt_rtl_data_delete;
     __tgt_rtl_run_target_team_region;
     __tgt_rtl_run_target_team_region_async;

diff  --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 3c7b709fb894..7baeebeb0a2a 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -168,9 +168,17 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
     rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr);
   } else {
     DP("copy from device to device\n");
+    DeviceTy &SrcDev = Devices[src_device];
+    DeviceTy &DstDev = Devices[dst_device];
+    // First try to use D2D memcpy which is more efficient. If fails, fall back
+    // to unefficient way.
+    if (SrcDev.isDataExchangable(DstDev)) {
+      rc = SrcDev.data_exchange(srcAddr, DstDev, dstAddr, length, nullptr);
+      if (rc == OFFLOAD_SUCCESS)
+        return OFFLOAD_SUCCESS;
+    }
+
     void *buffer = malloc(length);
-    DeviceTy& SrcDev = Devices[src_device];
-    DeviceTy& DstDev = Devices[dst_device];
     rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr);
     if (rc == OFFLOAD_SUCCESS)
       rc = DstDev.data_submit(dstAddr, buffer, length, nullptr);

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index b613cd2eccb9..c526725502f9 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -352,6 +352,18 @@ int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
                                     AsyncInfoPtr);
 }
 
+// Copy data from current device to destination device directly
+int32_t DeviceTy::data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr,
+                                int64_t Size, __tgt_async_info *AsyncInfoPtr) {
+  if (!AsyncInfoPtr || !RTL->data_exchange_async || !RTL->synchronize) {
+    assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
+    return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
+                              Size);
+  } else
+    return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
+                                    DstPtr, Size, AsyncInfoPtr);
+}
+
 // Run region on device
 int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
                              ptr
diff _t *TgtOffsets, int32_t TgtVarsSize,
@@ -380,6 +392,18 @@ int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
                                       ThreadLimit, LoopTripCount, AsyncInfoPtr);
 }
 
+// Whether data can be copied to DstDevice directly
+bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
+  if (RTL != DstDevice.RTL || !RTL->is_data_exchangable)
+    return false;
+
+  if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
+    return (RTL->data_exchange != nullptr) ||
+           (RTL->data_exchange_async != nullptr);
+
+  return false;
+}
+
 /// Check whether a device has an associated RTL and initialize it if it's not
 /// already initialized.
 bool device_is_ready(int device_num) {

diff  --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
index a3a5767f81ff..1b7f776bd3b8 100644
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -157,6 +157,9 @@ struct DeviceTy {
     return *this;
   }
 
+  // Return true if data can be copied to DstDevice directly
+  bool isDataExchangable(const DeviceTy& DstDevice);
+
   uint64_t getMapEntryRefCnt(void *HstPtrBegin);
   LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
   void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
@@ -176,10 +179,15 @@ struct DeviceTy {
 
   // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be
   // synchronous.
+  // Copy data from host to device
   int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
                       __tgt_async_info *AsyncInfoPtr);
+  // Copy data from device back to host
   int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
                         __tgt_async_info *AsyncInfoPtr);
+  // Copy data from current device to destination device directly
+  int32_t data_exchange(void *SrcPtr, DeviceTy DstDev, void *DstPtr,
+                        int64_t Size, __tgt_async_info *AsyncInfoPtr);
 
   int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
                      ptr
diff _t *TgtOffsets, int32_t TgtVarsSize,

diff  --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 6ce4f88d4c49..7c344ca6ee66 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -140,6 +140,12 @@ void RTLsTy::LoadRTLs() {
     *((void **)&R.run_team_region_async) =
         dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async");
     *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize");
+    *((void **)&R.data_exchange) =
+        dlsym(dynlib_handle, "__tgt_rtl_data_exchange");
+    *((void **)&R.data_exchange_async) =
+        dlsym(dynlib_handle, "__tgt_rtl_data_exchange_async");
+    *((void **)&R.is_data_exchangable) =
+        dlsym(dynlib_handle, "__tgt_rtl_is_data_exchangable");
 
     // No devices are supported by this RTL?
     if (!(R.NumberOfDevices = R.number_of_devices())) {

diff  --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h
index 86ecd6724a8d..941461f22b5c 100644
--- a/openmp/libomptarget/src/rtl.h
+++ b/openmp/libomptarget/src/rtl.h
@@ -26,6 +26,7 @@ struct __tgt_bin_desc;
 
 struct RTLInfoTy {
   typedef int32_t(is_valid_binary_ty)(void *);
+  typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t);
   typedef int32_t(number_of_devices_ty)();
   typedef int32_t(init_device_ty)(int32_t);
   typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
@@ -36,6 +37,9 @@ struct RTLInfoTy {
   typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
   typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t,
                                           __tgt_async_info *);
+  typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
+  typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
+                                          int64_t, __tgt_async_info *);
   typedef int32_t(data_delete_ty)(int32_t, void *);
   typedef int32_t(run_region_ty)(int32_t, void *, void **, ptr
diff _t *,
                                  int32_t);
@@ -64,6 +68,7 @@ struct RTLInfoTy {
 
   // Functions implemented in the RTL.
   is_valid_binary_ty *is_valid_binary = nullptr;
+  is_data_exchangable_ty *is_data_exchangable = nullptr;
   number_of_devices_ty *number_of_devices = nullptr;
   init_device_ty *init_device = nullptr;
   load_binary_ty *load_binary = nullptr;
@@ -72,6 +77,8 @@ struct RTLInfoTy {
   data_submit_async_ty *data_submit_async = nullptr;
   data_retrieve_ty *data_retrieve = nullptr;
   data_retrieve_async_ty *data_retrieve_async = nullptr;
+  data_exchange_ty *data_exchange = nullptr;
+  data_exchange_async_ty *data_exchange_async = nullptr;
   data_delete_ty *data_delete = nullptr;
   run_region_ty *run_region = nullptr;
   run_region_async_ty *run_region_async = nullptr;
@@ -100,6 +107,7 @@ struct RTLInfoTy {
     RTLName = r.RTLName;
 #endif
     is_valid_binary = r.is_valid_binary;
+    is_data_exchangable = r.is_data_exchangable;
     number_of_devices = r.number_of_devices;
     init_device = r.init_device;
     load_binary = r.load_binary;
@@ -108,6 +116,8 @@ struct RTLInfoTy {
     data_submit_async = r.data_submit_async;
     data_retrieve = r.data_retrieve;
     data_retrieve_async = r.data_retrieve_async;
+    data_exchange = r.data_exchange;
+    data_exchange_async = r.data_exchange_async;
     data_delete = r.data_delete;
     run_region = r.run_region;
     run_region_async = r.run_region_async;

diff  --git a/openmp/libomptarget/test/offloading/d2d_memcpy.c b/openmp/libomptarget/test/offloading/d2d_memcpy.c
new file mode 100644
index 000000000000..4c5f2c2ef5ff
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/d2d_memcpy.c
@@ -0,0 +1,69 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty
+
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+const int magic_num = 7;
+
+int main(int argc, char *argv[]) {
+  const int N = 128;
+  const int num_devices = omp_get_num_devices();
+
+  // No target device, just return
+  if (num_devices == 0) {
+    printf("PASS\n");
+    return 0;
+  }
+
+  const int src_device = 0;
+  int dst_device = 1;
+  if (dst_device >= num_devices)
+    dst_device = num_devices - 1;
+
+  int length = N * sizeof(int);
+  int *src_ptr = omp_target_alloc(length, src_device);
+  int *dst_ptr = omp_target_alloc(length, dst_device);
+
+  assert(src_ptr && "src_ptr is NULL");
+  assert(dst_ptr && "dst_ptr is NULL");
+
+#pragma omp target teams distribute parallel for device(src_device) \
+                   is_device_ptr(src_ptr)
+  for (int i = 0; i < N; ++i) {
+    src_ptr[i] = magic_num;
+  }
+
+  int rc =
+      omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, src_device);
+
+  assert(rc == 0 && "error in omp_target_memcpy");
+
+  int *buffer = malloc(length);
+
+  assert(buffer && "failed to allocate host buffer");
+
+#pragma omp target teams distribute parallel for device(dst_device) \
+                   map(from: buffer[0:N]) is_device_ptr(dst_ptr)
+  for (int i = 0; i < N; ++i) {
+    buffer[i] = dst_ptr[i] + magic_num;
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert(buffer[i] == 2 * magic_num);
+
+  printf("PASS\n");
+
+  // Free host and device memory
+  free(buffer);
+  omp_target_free(src_ptr, src_device);
+  omp_target_free(dst_ptr, dst_device);
+
+  return 0;
+}
+
+// CHECK: PASS


        


More information about the Openmp-commits mailing list