[Openmp-commits] [openmp] 32ed292 - [OpenMP] Optimized stream selection by scheduling data mapping for the same target region into a same stream

Shilei Tian via Openmp-commits openmp-commits at lists.llvm.org
Tue Apr 7 11:55:53 PDT 2020


Author: Shilei Tian
Date: 2020-04-07T14:55:47-04:00
New Revision: 32ed29271fd8c56abee8616e5a16a3c9e58f4741

URL: https://github.com/llvm/llvm-project/commit/32ed29271fd8c56abee8616e5a16a3c9e58f4741
DIFF: https://github.com/llvm/llvm-project/commit/32ed29271fd8c56abee8616e5a16a3c9e58f4741.diff

LOG: [OpenMP] Optimized stream selection by scheduling data mapping for the same target region into a same stream

Summary:
This patch introduces two things for offloading:
1. Asynchronous data transferring: those functions are suffixed with `_async`. They take one more argument than their synchronous counterparts: `__tgt_async_info*`, a new struct that has only one field, `void *Identifier`. This struct is for information exchange between different asynchronous operations. It can be used for stream selection, as in this case, or for operation synchronization, which is also used here. We may expect more uses in the future.
2. Optimization of stream selection for data mapping. The previous implementation used asynchronous device memory transfers but synchronized after each one. In fact, if kernel A needs four memory copies to the device and two memory copies back to the host, we can schedule these seven operations (four H2D, two D2H, and one kernel launch) into the same stream and synchronize only once, after the memory copies from device to host. This saves a huge amount of overhead compared with synchronizing after each operation.

Reviewers: jdoerfert, ye-luo

Reviewed By: jdoerfert

Subscribers: yaxunl, lildmh, guansong, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D77005

Added: 
    

Modified: 
    openmp/libomptarget/include/omptarget.h
    openmp/libomptarget/include/omptargetplugin.h
    openmp/libomptarget/plugins/cuda/src/rtl.cpp
    openmp/libomptarget/plugins/exports
    openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
    openmp/libomptarget/src/api.cpp
    openmp/libomptarget/src/device.cpp
    openmp/libomptarget/src/device.h
    openmp/libomptarget/src/omptarget.cpp
    openmp/libomptarget/src/private.h
    openmp/libomptarget/src/rtl.cpp
    openmp/libomptarget/src/rtl.h

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 826d8ed19802..de3afc36c7f2 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -111,6 +111,15 @@ struct __tgt_target_table {
       *EntriesEnd; // End of the table with all the entries (non inclusive)
 };
 
+/// This struct contains information exchanged between different asynchronous
+/// operations for device-dependent optimization and potential synchronization
+struct __tgt_async_info {
+  // A pointer to a queue-like structure where offloading operations are issued.
+  // We assume to use this structure to do synchronization. In CUDA backend, it
+  // is CUstream.
+  void *Queue = nullptr;
+};
+
 #ifdef __cplusplus
 extern "C" {
 #endif

diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
index e03416ccf2dd..b330c1935282 100644
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -58,15 +58,21 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
 // case an error occurred on the target device.
 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
 
-// Pass the data content to the target device using the target address.
-// In case of success, return zero. Otherwise, return an error code.
+// Pass the data content to the target device using the target address. If
+// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous.
+// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that
+// case, it is synchronous. In case of success, return zero. Otherwise, return
+// an error code.
 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
-                              int64_t Size);
+                              int64_t Size, __tgt_async_info *AsyncInfoPtr);
 
-// Retrieve the data content from the target device using its address.
-// In case of success, return zero. Otherwise, return an error code.
+// Retrieve the data content from the target device using its address. If
+// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous.
+// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that
+// case, it is synchronous. In case of success, return zero. Otherwise, return
+// an error code.
 int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
-                                int64_t Size);
+                                int64_t Size, __tgt_async_info *AsyncInfoPtr);
 
 // De-allocate the data referenced by target ptr on the device. In case of
 // success, return zero. Otherwise, return an error code.
@@ -75,17 +81,28 @@ int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
 // Transfer control to the offloaded entry Entry on the target device.
 // Args and Offsets are arrays of NumArgs size of target addresses and
 // offsets. An offset should be added to the target address before passing it
-// to the outlined function on device side. In case of success, return zero.
-// Otherwise, return an error code.
+// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is
+// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be
+// ignored on some platforms, like x86_64. In that case, it is synchronous. In
+// case of success, return zero. Otherwise, return an error code.
 int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
-                                    ptrdiff_t *Offsets, int32_t NumArgs);
+                                    ptrdiff_t *Offsets, int32_t NumArgs,
+                                    __tgt_async_info *AsyncInfoPtr);
 
 // Similar to __tgt_rtl_run_target_region, but additionally specify the
-// number of teams to be created and a number of threads in each team.
+// number of teams to be created and a number of threads in each team. If
+// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous.
+// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that
+// case, it is synchronous.
 int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
                                          ptrdiff_t *Offsets, int32_t NumArgs,
                                          int32_t NumTeams, int32_t ThreadLimit,
-                                         uint64_t loop_tripcount);
+                                         uint64_t loop_tripcount,
+                                         __tgt_async_info *AsyncInfoPtr);
+
+// Device synchronization. In case of success, return zero. Otherwise, return an
+// error code.
+int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr);
 
 #ifdef __cplusplus
 }

diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 54248daa7f19..c0fb87b8e19d 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -309,6 +309,68 @@ class RTLDeviceInfoTy {
 
 static RTLDeviceInfoTy DeviceInfo;
 
+namespace {
+CUstream selectStream(int32_t Id, __tgt_async_info *AsyncInfo) {
+  if (!AsyncInfo)
+    return DeviceInfo.getNextStream(Id);
+
+  if (!AsyncInfo->Queue)
+    AsyncInfo->Queue = DeviceInfo.getNextStream(Id);
+
+  return reinterpret_cast<CUstream>(AsyncInfo->Queue);
+}
+
+int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
+                     __tgt_async_info *AsyncInfoPtr) {
+  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  CUstream Stream = selectStream(DeviceId, AsyncInfoPtr);
+
+  err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+       ", device = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
+                   __tgt_async_info *AsyncInfoPtr) {
+  assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[DeviceId]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  CUstream Stream = selectStream(DeviceId, AsyncInfoPtr);
+
+  err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+       ", device = " DPxMOD ", size = %" PRId64 "\n",
+       DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+} // namespace
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -663,69 +725,38 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
 }
 
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
-    int64_t size) {
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting CUDA context\n");
-    CUDA_ERR_STRING(err);
+                              int64_t size, __tgt_async_info *async_info_ptr) {
+  // The function dataSubmit is always asynchronous. Considering some data
+  // transfer must be synchronous, we assume if async_info_ptr is nullptr, the
+  // transfer will be synchronous by creating a temporary async info and then
+  // synchronizing after call dataSubmit; otherwise, it is asynchronous.
+  if (async_info_ptr)
+    return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr);
+
+  __tgt_async_info async_info;
+  int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
     return OFFLOAD_FAIL;
-  }
-
-  CUstream &Stream = DeviceInfo.getNextStream(device_id);
-
-  err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
-       ", device = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
 
-  err = cuStreamSynchronize(Stream);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when synchronizing async data transfer from host to device. "
-       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
+  return __tgt_rtl_synchronize(device_id, &async_info);
 }
 
 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
-    int64_t size) {
-  // Set the context we are using.
-  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when setting CUDA context\n");
-    CUDA_ERR_STRING(err);
+                                int64_t size,
+                                __tgt_async_info *async_info_ptr) {
+  // The function dataRetrieve is always asynchronous. Considering some data
+  // transfer must be synchronous, we assume if async_info_ptr is nullptr, the
+  // transfer will be synchronous by creating a temporary async info and then
+  // synchronizing after call dataRetrieve; otherwise, it is asynchronous.
+  if (async_info_ptr)
+    return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr);
+
+  __tgt_async_info async_info;
+  int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
     return OFFLOAD_FAIL;
-  }
 
-  CUstream &Stream = DeviceInfo.getNextStream(device_id);
-
-  err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
-       ", device = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
-
-  err = cuStreamSynchronize(Stream);
-  if (err != CUDA_SUCCESS) {
-    DP("Error when synchronizing async data transfer from device to host. "
-       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
-    CUDA_ERR_STRING(err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
+  return __tgt_rtl_synchronize(device_id, &async_info);
 }
 
 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
@@ -747,8 +778,12 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
 }
 
 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
-    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
-    int32_t thread_limit, uint64_t loop_tripcount) {
+                                         void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount,
+                                         __tgt_async_info *async_info) {
   // Set the context we are using.
   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
   if (err != CUDA_SUCCESS) {
@@ -844,8 +879,7 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
   DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
      cudaThreadsPerBlock);
 
-  CUstream &Stream = DeviceInfo.getNextStream(device_id);
-
+  CUstream Stream = selectStream(device_id, async_info);
   err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
                        cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
                        Stream, &args[0], 0);
@@ -858,25 +892,35 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
   DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(tgt_entry_ptr));
 
-  CUresult sync_err = cuStreamSynchronize(Stream);
-  if (sync_err != CUDA_SUCCESS) {
-    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
-    CUDA_ERR_STRING(sync_err);
-    return OFFLOAD_FAIL;
-  } else {
-    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
-  }
-
   return OFFLOAD_SUCCESS;
 }
 
 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
-    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num,
+                                    __tgt_async_info *async_info) {
   // use one team and the default number of threads.
   const int32_t team_num = 1;
   const int32_t thread_limit = 0;
   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
-      tgt_offsets, arg_num, team_num, thread_limit, 0);
+                                          tgt_offsets, arg_num, team_num,
+                                          thread_limit, 0, async_info);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) {
+  assert(async_info && "async_info is nullptr");
+  assert(async_info->Queue && "async_info->Queue is nullptr");
+
+  CUstream Stream = reinterpret_cast<CUstream>(async_info->Queue);
+  CUresult Err = cuStreamSynchronize(Stream);
+  if (Err != CUDA_SUCCESS) {
+    DP("Error when synchronizing stream. stream = " DPxMOD
+       ", async info ptr = " DPxMOD "\n",
+       DPxPTR(Stream), DPxPTR(async_info));
+    CUDA_ERR_STRING(Err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
 }
 
 #ifdef __cplusplus

diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
index a14bedf0791a..cbbad6d0364d 100644
--- a/openmp/libomptarget/plugins/exports
+++ b/openmp/libomptarget/plugins/exports
@@ -11,6 +11,7 @@ VERS1.0 {
     __tgt_rtl_data_delete;
     __tgt_rtl_run_target_team_region;
     __tgt_rtl_run_target_region;
+    __tgt_rtl_synchronize;
   local:
     *;
 };

diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
index 00e58d870f4c..84875f591ac0 100644
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -277,13 +277,13 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
 }
 
 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
-                              int64_t size) {
+                              int64_t size, __tgt_async_info *) {
   memcpy(tgt_ptr, hst_ptr, size);
   return OFFLOAD_SUCCESS;
 }
 
 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
-                                int64_t size) {
+                                int64_t size, __tgt_async_info *) {
   memcpy(hst_ptr, tgt_ptr, size);
   return OFFLOAD_SUCCESS;
 }
@@ -293,9 +293,11 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
   return OFFLOAD_SUCCESS;
 }
 
-int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
-    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
-    int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+int32_t __tgt_rtl_run_target_team_region(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/,
+    __tgt_async_info *async_info /*not used*/) {
   // ignore team num and thread limit.
 
   // Use libffi to launch execution.
@@ -328,10 +330,18 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
 }
 
 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
-    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num,
+                                    __tgt_async_info *async_info_ptr) {
   // use one team and one thread.
   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
-      tgt_offsets, arg_num, 1, 1, 0);
+                                          tgt_offsets, arg_num, 1, 1, 0,
+                                          async_info_ptr);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+                              __tgt_async_info *async_info_ptr) {
+  return OFFLOAD_SUCCESS;
 }
 
 #ifdef __cplusplus

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index cabe63927f5a..3c7b709fb894 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -161,19 +161,19 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
   } else if (src_device == omp_get_initial_device()) {
     DP("copy from host to device\n");
     DeviceTy& DstDev = Devices[dst_device];
-    rc = DstDev.data_submit(dstAddr, srcAddr, length);
+    rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr);
   } else if (dst_device == omp_get_initial_device()) {
     DP("copy from device to host\n");
     DeviceTy& SrcDev = Devices[src_device];
-    rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
+    rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr);
   } else {
     DP("copy from device to device\n");
     void *buffer = malloc(length);
     DeviceTy& SrcDev = Devices[src_device];
     DeviceTy& DstDev = Devices[dst_device];
-    rc = SrcDev.data_retrieve(buffer, srcAddr, length);
+    rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr);
     if (rc == OFFLOAD_SUCCESS)
-      rc = DstDev.data_submit(dstAddr, buffer, length);
+      rc = DstDev.data_submit(dstAddr, buffer, length, nullptr);
     free(buffer);
   }
 

diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index e215a5d6395d..09ddcceff9ea 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -331,31 +331,38 @@ __tgt_target_table *DeviceTy::load_binary(void *Img) {
   return rc;
 }
 
-// Submit data to device.
+// Submit data to device
 int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
-    int64_t Size) {
-  return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
+                              int64_t Size, __tgt_async_info *AsyncInfoPtr) {
+
+  return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
+                          AsyncInfoPtr);
 }
 
-// Retrieve data from device.
+// Retrieve data from device
 int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
-    int64_t Size) {
-  return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
+                                int64_t Size, __tgt_async_info *AsyncInfoPtr) {
+  return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
+                            AsyncInfoPtr);
 }
 
 // Run region on device
 int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
-    ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
+                             ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+                             __tgt_async_info *AsyncInfo) {
   return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
-      TgtVarsSize);
+                         TgtVarsSize, AsyncInfo);
 }
 
 // Run team region on device.
 int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
-    ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
-    int32_t ThreadLimit, uint64_t LoopTripCount) {
+                                  ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+                                  int32_t NumTeams, int32_t ThreadLimit,
+                                  uint64_t LoopTripCount,
+                                  __tgt_async_info *AsyncInfo) {
   return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
-      TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
+                              TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount,
+                              AsyncInfo);
 }
 
 /// Check whether a device has an associated RTL and initialize it if it's not

diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
index 8379f0c65ae4..e44adaf70e4e 100644
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -24,6 +24,7 @@
 struct RTLInfoTy;
 struct __tgt_bin_desc;
 struct __tgt_target_table;
+struct __tgt_async_info;
 
 /// Map between host data and target data.
 struct HostDataToTargetTy {
@@ -173,14 +174,20 @@ struct DeviceTy {
   int32_t initOnce();
   __tgt_target_table *load_binary(void *Img);
 
-  int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
-  int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+  // Asynchronous data transfer. When AsyncInfoPtr is nullptr, the transfer will
+  // be synchronous.
+  int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
+                      __tgt_async_info *AsyncInfoPtr);
+  int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
+                        __tgt_async_info *AsyncInfoPtr);
 
   int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
-      ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
+                     ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+                     __tgt_async_info *AsyncInfo);
   int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
-      ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
-      int32_t ThreadLimit, uint64_t LoopTripCount);
+                          ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
+                          int32_t NumTeams, int32_t ThreadLimit,
+                          uint64_t LoopTripCount, __tgt_async_info *AsyncInfo);
 
 private:
   // Call to RTL

diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index b84cc882fda5..4517a89726e3 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -215,8 +215,9 @@ static int32_t member_of(int64_t type) {
 }
 
 /// Internal function to do the mapping and transfer the data to the device
-int target_data_begin(DeviceTy &Device, int32_t arg_num,
-    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base,
+                      void **args, int64_t *arg_sizes, int64_t *arg_types,
+                      __tgt_async_info *async_info_ptr) {
   // process each input.
   for (int32_t i = 0; i < arg_num; ++i) {
     // Ignore private variables and arrays - there is no mapping for them.
@@ -316,8 +317,9 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
 
       if (copy && !IsHostPtr) {
         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
-            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
+           data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size,
+                                    async_info_ptr);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed.\n");
           return OFFLOAD_FAIL;
@@ -331,7 +333,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
       void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
       int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
-          sizeof(void *));
+                                  sizeof(void *), async_info_ptr);
       if (rt != OFFLOAD_SUCCESS) {
         DP("Copying data to device failed.\n");
         return OFFLOAD_FAIL;
@@ -349,7 +351,8 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
 
 /// Internal function to undo the mapping and retrieve the data from the device.
 int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
-    void **args, int64_t *arg_sizes, int64_t *arg_types) {
+                    void **args, int64_t *arg_sizes, int64_t *arg_types,
+                    __tgt_async_info *async_info_ptr) {
   // process each input.
   for (int32_t i = arg_num - 1; i >= 0; --i) {
     // Ignore private variables and arrays - there is no mapping for them.
@@ -419,8 +422,9 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
             !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
               TgtPtrBegin == HstPtrBegin)) {
           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
-              data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
+             data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size,
+                                        async_info_ptr);
           if (rt != OFFLOAD_SUCCESS) {
             DP("Copying data from device failed.\n");
             return OFFLOAD_FAIL;
@@ -509,7 +513,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
     if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
           arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-      int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
+      int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr);
       if (rt != OFFLOAD_SUCCESS) {
         DP("Copying data from device failed.\n");
         return OFFLOAD_FAIL;
@@ -536,7 +540,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
       DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
           arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
-      int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
+      int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr);
       if (rt != OFFLOAD_SUCCESS) {
         DP("Copying data to device failed.\n");
         return OFFLOAD_FAIL;
@@ -556,7 +560,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
             "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
             DPxPTR(it->second.TgtPtrAddr));
         rt = Device.data_submit(it->second.TgtPtrAddr,
-            &it->second.TgtPtrVal, sizeof(void *));
+            &it->second.TgtPtrVal, sizeof(void *), nullptr);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed.\n");
           Device.ShadowMtx.unlock();
@@ -638,9 +642,11 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
   TrlTblMtx->unlock();
   assert(TargetTable && "Global data has not been mapped\n");
 
+  __tgt_async_info AsyncInfo;
+
   // Move data to device.
   int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
-      arg_types);
+                             arg_types, &AsyncInfo);
   if (rc != OFFLOAD_SUCCESS) {
     DP("Call to target_data_begin failed, abort target.\n");
     return OFFLOAD_FAIL;
@@ -691,7 +697,7 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
         int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
-                                    sizeof(void *));
+                                    sizeof(void *), &AsyncInfo);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Copying data to device failed.\n");
           return OFFLOAD_FAIL;
@@ -732,9 +738,10 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
 #endif
       // If first-private, copy data from host
       if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
-        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i],
+                                    &AsyncInfo);
         if (rt != OFFLOAD_SUCCESS) {
-          DP ("Copying data to device failed, failed.\n");
+          DP("Copying data to device failed, failed.\n");
           return OFFLOAD_FAIL;
         }
       }
@@ -780,11 +787,12 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
       DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
   if (IsTeamConstruct) {
     rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
-        &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
-        thread_limit, ltc);
+                                &tgt_args[0], &tgt_offsets[0], tgt_args.size(),
+                                team_num, thread_limit, ltc, &AsyncInfo);
   } else {
     rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
-        &tgt_args[0], &tgt_offsets[0], tgt_args.size());
+                           &tgt_args[0], &tgt_offsets[0], tgt_args.size(),
+                           &AsyncInfo);
   }
   if (rc != OFFLOAD_SUCCESS) {
     DP ("Executing target region abort target.\n");
@@ -802,11 +810,11 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
 
   // Move data from device.
   int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
-      arg_types);
+                           arg_types, &AsyncInfo);
   if (rt != OFFLOAD_SUCCESS) {
     DP("Call to target_data_end failed, abort targe.\n");
     return OFFLOAD_FAIL;
   }
 
-  return OFFLOAD_SUCCESS;
+  return Device.RTL->synchronize(device_id, &AsyncInfo);
 }

diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 7aaa295d9686..6e6b39f3fdca 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -18,10 +18,13 @@
 #include <cstdint>
 
 extern int target_data_begin(DeviceTy &Device, int32_t arg_num,
-    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
+                             void **args_base, void **args, int64_t *arg_sizes,
+                             int64_t *arg_types,
+                             __tgt_async_info *async_info_ptr = nullptr);
 
 extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
-    void **args, int64_t *arg_sizes, int64_t *arg_types);
+                           void **args, int64_t *arg_sizes, int64_t *arg_types,
+                           __tgt_async_info *async_info_ptr = nullptr);
 
 extern int target_data_update(DeviceTy &Device, int32_t arg_num,
     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);

diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 3b9efd6ecbdf..ed0be2c2ee53 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -126,6 +126,9 @@ void RTLsTy::LoadRTLs() {
     if (!(*((void**) &R.run_team_region) = dlsym(
               dynlib_handle, "__tgt_rtl_run_target_team_region")))
       continue;
+    if (!(*((void**) &R.synchronize) = dlsym(
+              dynlib_handle, "__tgt_rtl_synchronize")))
+      continue;
 
     // Optional functions
     *((void**) &R.init_requires) = dlsym(

diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h
index 439efec4ad91..846c89b0ed2e 100644
--- a/openmp/libomptarget/src/rtl.h
+++ b/openmp/libomptarget/src/rtl.h
@@ -30,14 +30,18 @@ struct RTLInfoTy {
   typedef int32_t(init_device_ty)(int32_t);
   typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
   typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
-  typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
-  typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t,
+                                  __tgt_async_info *);
+  typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t,
+                                    __tgt_async_info *);
   typedef int32_t(data_delete_ty)(int32_t, void *);
-  typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
-                                 int32_t);
+  typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t,
+                                 __tgt_async_info *);
   typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
-                                      int32_t, int32_t, int32_t, uint64_t);
+                                      int32_t, int32_t, int32_t, uint64_t,
+                                      __tgt_async_info *);
   typedef int64_t(init_requires_ty)(int64_t);
+  typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *);
 
   int32_t Idx = -1;             // RTL index, index is the number of devices
                                 // of other RTLs that were registered before,
@@ -63,6 +67,7 @@ struct RTLInfoTy {
   run_region_ty *run_region = nullptr;
   run_team_region_ty *run_team_region = nullptr;
   init_requires_ty *init_requires = nullptr;
+  synchronize_ty *synchronize = nullptr;
 
   // Are there images associated with this RTL.
   bool isUsed = false;
@@ -95,6 +100,7 @@ struct RTLInfoTy {
     run_team_region = r.run_team_region;
     init_requires = r.init_requires;
     isUsed = r.isUsed;
+    synchronize = r.synchronize;
   }
 };
 


        


More information about the Openmp-commits mailing list