[Openmp-commits] [openmp] 758b849 - [OpenMP] Unify omptarget API and usage wrt. `__tgt_async_info`
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Tue Feb 16 13:38:12 PST 2021
Author: Johannes Doerfert
Date: 2021-02-16T15:38:06-06:00
New Revision: 758b8499310a5d44a41de67d656bd32dc3fd1023
URL: https://github.com/llvm/llvm-project/commit/758b8499310a5d44a41de67d656bd32dc3fd1023
DIFF: https://github.com/llvm/llvm-project/commit/758b8499310a5d44a41de67d656bd32dc3fd1023.diff
LOG: [OpenMP] Unify omptarget API and usage wrt. `__tgt_async_info`
This patch unifies our libomptarget API in two ways:
- always pass a `__tgt_async_info` object, the Queue member decides if
it is in use or not.
- (almost) always synchronize in the interface layer and not in the
omptarget layer.
A side effect is that we now put all constructor and static initializer
kernels in a stream too, if the device utilizes `__tgt_async_info`.
The patch contains a TODO which can be addressed as we add support for
asynchronous malloc and free in the plugin API. This is the only
`synchronizeAsyncInfo` left in the omptarget layer.
Side note: On a V100 system the GridMini performance for small sizes
more than doubled.
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D96379
Added:
Modified:
openmp/libomptarget/src/api.cpp
openmp/libomptarget/src/device.cpp
openmp/libomptarget/src/device.h
openmp/libomptarget/src/interface.cpp
openmp/libomptarget/src/omptarget.cpp
openmp/libomptarget/src/private.h
openmp/libomptarget/src/rtl.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 27fec34be631..adacc5ae4c01 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "device.h"
+#include "omptarget.h"
#include "private.h"
#include "rtl.h"
@@ -171,11 +172,13 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
} else if (src_device == omp_get_initial_device()) {
DP("copy from host to device\n");
DeviceTy &DstDev = PM->Devices[dst_device];
- rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr);
+ AsyncInfoTy AsyncInfo(DstDev);
+ rc = DstDev.submitData(dstAddr, srcAddr, length, AsyncInfo);
} else if (dst_device == omp_get_initial_device()) {
DP("copy from device to host\n");
DeviceTy &SrcDev = PM->Devices[src_device];
- rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr);
+ AsyncInfoTy AsyncInfo(SrcDev);
+ rc = SrcDev.retrieveData(dstAddr, srcAddr, length, AsyncInfo);
} else {
DP("copy from device to device\n");
DeviceTy &SrcDev = PM->Devices[src_device];
@@ -183,15 +186,21 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
// First try to use D2D memcpy which is more efficient. If fails, fall back
// to unefficient way.
if (SrcDev.isDataExchangable(DstDev)) {
- rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr);
+ AsyncInfoTy AsyncInfo(SrcDev);
+ rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
return OFFLOAD_SUCCESS;
}
void *buffer = malloc(length);
- rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr);
- if (rc == OFFLOAD_SUCCESS)
- rc = DstDev.submitData(dstAddr, buffer, length, nullptr);
+ {
+ AsyncInfoTy AsyncInfo(SrcDev);
+ rc = SrcDev.retrieveData(buffer, srcAddr, length, AsyncInfo);
+ }
+ if (rc == OFFLOAD_SUCCESS) {
+ AsyncInfoTy AsyncInfo(SrcDev);
+ rc = DstDev.submitData(dstAddr, buffer, length, AsyncInfo);
+ }
free(buffer);
}
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 724b8871ebba..50017ac1e906 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -415,27 +415,27 @@ int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
// Submit data to device
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
- __tgt_async_info *AsyncInfoPtr) {
- if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize)
+ AsyncInfoTy &AsyncInfo) {
+ if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
else
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
- AsyncInfoPtr);
+ AsyncInfo);
}
// Retrieve data from device
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
- int64_t Size, __tgt_async_info *AsyncInfoPtr) {
- if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize)
+ int64_t Size, AsyncInfoTy &AsyncInfo) {
+ if (!RTL->data_retrieve_async || !RTL->synchronize)
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
else
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
- AsyncInfoPtr);
+ AsyncInfo);
}
// Copy data from current device to destination device directly
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
- int64_t Size, __tgt_async_info *AsyncInfo) {
+ int64_t Size, AsyncInfoTy &AsyncInfo) {
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
@@ -448,13 +448,13 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
// Run region on device
int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
- __tgt_async_info *AsyncInfoPtr) {
- if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize)
+ AsyncInfoTy &AsyncInfo) {
+ if (!RTL->run_region || !RTL->synchronize)
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
TgtVarsSize);
else
return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
- TgtOffsets, TgtVarsSize, AsyncInfoPtr);
+ TgtOffsets, TgtVarsSize, AsyncInfo);
}
// Run team region on device.
@@ -462,15 +462,15 @@ int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount,
- __tgt_async_info *AsyncInfoPtr) {
- if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize)
+ AsyncInfoTy &AsyncInfo) {
+ if (!RTL->run_team_region_async || !RTL->synchronize)
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
LoopTripCount);
else
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams,
- ThreadLimit, LoopTripCount, AsyncInfoPtr);
+ ThreadLimit, LoopTripCount, AsyncInfo);
}
// Whether data can be copied to DstDevice directly
@@ -485,9 +485,9 @@ bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
return false;
}
-int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) {
+int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
if (RTL->synchronize)
- return RTL->synchronize(RTLDeviceID, AsyncInfoPtr);
+ return RTL->synchronize(RTLDeviceID, AsyncInfo);
return OFFLOAD_SUCCESS;
}
diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h
index 06e24e16606c..84a0987e30fe 100644
--- a/openmp/libomptarget/src/device.h
+++ b/openmp/libomptarget/src/device.h
@@ -22,13 +22,13 @@
#include <set>
#include <vector>
+#include "omptarget.h"
#include "rtl.h"
// Forward declarations.
struct RTLInfoTy;
struct __tgt_bin_desc;
struct __tgt_target_table;
-struct __tgt_async_info;
using map_var_info_t = void *;
@@ -200,24 +200,24 @@ struct DeviceTy {
// synchronous.
// Copy data from host to device
int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
- __tgt_async_info *AsyncInfoPtr);
+ AsyncInfoTy &AsyncInfo);
// Copy data from device back to host
int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
- __tgt_async_info *AsyncInfoPtr);
+ AsyncInfoTy &AsyncInfo);
// Copy data from current device to destination device directly
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
- int64_t Size, __tgt_async_info *AsyncInfo);
+ int64_t Size, AsyncInfoTy &AsyncInfo);
int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
- int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr);
+ int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
int32_t NumTeams, int32_t ThreadLimit,
- uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr);
+ uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
/// Synchronize device/queue/event based on \p AsyncInfoPtr and return
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
- int32_t synchronize(__tgt_async_info *AsyncInfoPtr);
+ int32_t synchronize(AsyncInfoTy &AsyncInfo);
private:
// Call to RTL
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 0140c5592028..095c4d31d1a8 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "device.h"
+#include "omptarget.h"
#include "private.h"
#include "rtl.h"
@@ -183,8 +184,11 @@ EXTERN void __tgt_target_data_begin_mapper(ident_t *loc, int64_t device_id,
}
#endif
+ AsyncInfoTy AsyncInfo(Device);
int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes,
- arg_types, arg_names, arg_mappers, nullptr);
+ arg_types, arg_names, arg_mappers, AsyncInfo);
+ if (rc == OFFLOAD_SUCCESS)
+ rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
}
@@ -270,8 +274,11 @@ EXTERN void __tgt_target_data_end_mapper(ident_t *loc, int64_t device_id,
}
#endif
+ AsyncInfoTy AsyncInfo(Device);
int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes,
- arg_types, arg_names, arg_mappers, nullptr);
+ arg_types, arg_names, arg_mappers, AsyncInfo);
+ if (rc == OFFLOAD_SUCCESS)
+ rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
}
@@ -335,8 +342,11 @@ EXTERN void __tgt_target_data_update_mapper(ident_t *loc, int64_t device_id,
arg_names, "Updating OpenMP data");
DeviceTy &Device = PM->Devices[device_id];
+ AsyncInfoTy AsyncInfo(Device);
int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes,
- arg_types, arg_names, arg_mappers, nullptr);
+ arg_types, arg_names, arg_mappers, AsyncInfo);
+ if (rc == OFFLOAD_SUCCESS)
+ rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
}
@@ -408,9 +418,12 @@ EXTERN int __tgt_target_mapper(ident_t *loc, int64_t device_id, void *host_ptr,
#endif
DeviceTy &Device = PM->Devices[device_id];
- int rc =
- target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
- arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr);
+ AsyncInfoTy AsyncInfo(Device);
+ int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
+ arg_types, arg_names, arg_mappers, 0, 0, false /*team*/,
+ AsyncInfo);
+ if (rc == OFFLOAD_SUCCESS)
+ rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
return rc;
}
@@ -490,9 +503,12 @@ EXTERN int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id,
#endif
DeviceTy &Device = PM->Devices[device_id];
+ AsyncInfoTy AsyncInfo(Device);
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, team_num, thread_limit,
- true /*team*/, nullptr);
+ true /*team*/, AsyncInfo);
+ if (rc == OFFLOAD_SUCCESS)
+ rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
return rc;
}
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index dc493459fb47..51dcfecbad60 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -23,7 +23,7 @@ int AsyncInfoTy::synchronize() {
int Result = OFFLOAD_SUCCESS;
if (AsyncInfo.Queue) {
// If we have a queue we need to synchronize it now.
- Result = Device.synchronize(&AsyncInfo);
+ Result = Device.synchronize(*this);
assert(AsyncInfo.Queue == nullptr &&
"The device plugin should have nulled the queue to indicate there "
"are no outstanding actions!");
@@ -166,6 +166,7 @@ static int InitLibrary(DeviceTy &Device) {
* Run ctors for static objects
*/
if (!Device.PendingCtorsDtors.empty()) {
+ AsyncInfoTy AsyncInfo(Device);
// Call all ctors for all libraries registered so far
for (auto &lib : Device.PendingCtorsDtors) {
if (!lib.second.PendingCtors.empty()) {
@@ -174,7 +175,7 @@ static int InitLibrary(DeviceTy &Device) {
void *ctor = entry;
int rc =
target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
- nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
+ nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
if (rc != OFFLOAD_SUCCESS) {
REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
Device.PendingGlobalsMtx.unlock();
@@ -186,6 +187,9 @@ static int InitLibrary(DeviceTy &Device) {
DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
}
}
+ // All constructors have been issued, wait for them now.
+ if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
}
Device.HasPendingGlobals = false;
Device.PendingGlobalsMtx.unlock();
@@ -226,6 +230,7 @@ static int32_t getParentIndex(int64_t type) {
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
int64_t arg_size, int64_t arg_type,
map_var_info_t arg_names, void *arg_mapper,
+ AsyncInfoTy &AsyncInfo,
TargetDataFuncPtrTy target_data_function) {
TIMESCOPE_WITH_IDENT(loc);
DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
@@ -256,11 +261,10 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
MapperArgNames[I] = C.Name;
}
- int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
- MapperArgsBase.data(), MapperArgs.data(),
- MapperArgSizes.data(), MapperArgTypes.data(),
- MapperArgNames.data(), /*arg_mappers*/ nullptr,
- /* AsyncInfoTy */ nullptr);
+ int rc = target_data_function(
+ loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(),
+ MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(),
+ MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo);
return rc;
}
@@ -269,7 +273,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes,
int64_t *arg_types, map_var_info_t *arg_names,
- void **arg_mappers, AsyncInfoTy *AsyncInfo) {
+ void **arg_mappers, AsyncInfoTy &AsyncInfo) {
// process each input.
for (int32_t i = 0; i < arg_num; ++i) {
// Ignore private variables and arrays - there is no mapping for them.
@@ -286,7 +290,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
int rc = targetDataMapper(loc, Device, args_base[i], args[i],
arg_sizes[i], arg_types[i], arg_name,
- arg_mappers[i], targetDataBegin);
+ arg_mappers[i], AsyncInfo, targetDataBegin);
if (rc != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
@@ -416,7 +420,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
int rt =
- Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo);
+ Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@@ -430,7 +434,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
- sizeof(void *), *AsyncInfo);
+ sizeof(void *), AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@@ -470,7 +474,7 @@ struct DeallocTgtPtrInfo {
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
- void **ArgMappers, AsyncInfoTy *AsyncInfo) {
+ void **ArgMappers, AsyncInfoTy &AsyncInfo) {
int Ret;
std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
// process each input.
@@ -488,9 +492,9 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
DP("Calling targetDataMapper for the %dth argument\n", I);
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
- Ret =
- targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
- ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd);
+ Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
+ ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
+ targetDataEnd);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
@@ -585,7 +589,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
- *AsyncInfo);
+ AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
@@ -637,17 +641,13 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
}
}
+ // TODO: We should not synchronize here but pass the AsyncInfo object to the
+ // allocate/deallocate device APIs.
+ //
// We need to synchronize before deallocating data.
- // If AsyncInfo is nullptr, the previous data transfer (if has) will be
- // synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is
- // nullptr, there is no data transfer happened because once there is,
- // AsyncInfo->Queue will not be nullptr, so again, we don't need to
- // synchronize.
- if (AsyncInfo) {
- Ret = AsyncInfo->synchronize();
- if (Ret != OFFLOAD_SUCCESS)
- return OFFLOAD_FAIL;
- }
+ Ret = AsyncInfo.synchronize();
+ if (Ret != OFFLOAD_SUCCESS)
+ return OFFLOAD_FAIL;
// Deallocate target pointer
for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
@@ -664,7 +664,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
void *HstPtrBegin, int64_t ArgSize,
- int64_t ArgType) {
+ int64_t ArgType, AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_IDENT(loc);
bool IsLast, IsHostPtr;
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
@@ -690,7 +690,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
if (ArgType & OMP_TGT_MAPTYPE_FROM) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
+ int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
@@ -717,7 +717,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
if (ArgType & OMP_TGT_MAPTYPE_TO) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
- int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
+ int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@@ -737,7 +737,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
"pointer " DPxMOD "\n",
DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
- sizeof(void *), nullptr);
+ sizeof(void *), AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
Device.ShadowMtx.unlock();
@@ -753,8 +753,8 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
void *ArgsBase,
__tgt_target_non_contig *NonContig,
uint64_t Size, int64_t ArgType,
- int CurrentDim, int DimSize,
- uint64_t Offset) {
+ int CurrentDim, int DimSize, uint64_t Offset,
+ AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_IDENT(loc);
int Ret = OFFLOAD_SUCCESS;
if (CurrentDim < DimSize) {
@@ -766,7 +766,7 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
if (CurrentDim != DimSize - 1 || I == 0) {
Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
ArgType, CurrentDim + 1, DimSize,
- Offset + CurOffset);
+ Offset + CurOffset, AsyncInfo);
// Stop the whole process if any contiguous piece returns anything
// other than OFFLOAD_SUCCESS.
if (Ret != OFFLOAD_SUCCESS)
@@ -778,7 +778,8 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
" len %" PRIu64 "\n",
DPxPTR(Ptr), Offset, Size);
- Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType);
+ Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
+ AsyncInfo);
}
return Ret;
}
@@ -794,12 +795,10 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
}
/// Internal function to pass data to/from the target.
-// AsyncInfo is currently unused, added here so targetDataUpdate has the
-// same signature as targetDataBegin and targetDataEnd.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
- void **ArgMappers, AsyncInfoTy *AsyncInfo) {
+ void **ArgMappers, AsyncInfoTy &AsyncInfo) {
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@@ -814,7 +813,7 @@ int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
- ArgTypes[I], ArgName, ArgMappers[I],
+ ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataUpdate);
if (Ret != OFFLOAD_SUCCESS) {
@@ -837,10 +836,10 @@ int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
Ret = targetDataNonContiguous(
loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
- /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
+ /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
} else {
Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
- ArgTypes[I]);
+ ArgTypes[I], AsyncInfo);
}
if (Ret == OFFLOAD_FAIL)
return OFFLOAD_FAIL;
@@ -950,7 +949,7 @@ class PrivateArgumentManagerTy {
/// A reference to the \p DeviceTy object
DeviceTy &Device;
/// A pointer to a \p AsyncInfoTy object
- AsyncInfoTy *AsyncInfo;
+ AsyncInfoTy &AsyncInfo;
// TODO: What would be the best value here? Should we make it configurable?
// If the size is larger than this threshold, we will allocate and transfer it
@@ -959,7 +958,7 @@ class PrivateArgumentManagerTy {
public:
/// Constructor
- PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo)
+ PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
: Device(Dev), AsyncInfo(AsyncInfo) {}
/// Add a private argument
@@ -986,7 +985,7 @@ class PrivateArgumentManagerTy {
#endif
// If first-private, copy data from host
if (IsFirstPrivate) {
- int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo);
+ int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
DP("Copying data to device failed, failed.\n");
return OFFLOAD_FAIL;
@@ -1042,7 +1041,7 @@ class PrivateArgumentManagerTy {
FirstPrivateArgSize, DPxPTR(TgtPtr));
// Transfer data to target device
int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
- FirstPrivateArgSize, *AsyncInfo);
+ FirstPrivateArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
DP("Failed to submit data of private arguments.\n");
return OFFLOAD_FAIL;
@@ -1090,7 +1089,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
std::vector<void *> &TgtArgs,
std::vector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
- AsyncInfoTy *AsyncInfo) {
+ AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
DeviceTy &Device = PM->Devices[DeviceId];
int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
@@ -1141,7 +1140,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
- sizeof(void *), *AsyncInfo);
+ sizeof(void *), AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@@ -1211,7 +1210,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
- AsyncInfoTy *AsyncInfo) {
+ AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
DeviceTy &Device = PM->Devices[DeviceId];
@@ -1243,7 +1242,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
- int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) {
+ int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
int32_t DeviceId = Device.DeviceID;
TableMap *TM = getTableMap(HostPtr);
@@ -1264,12 +1263,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
}
assert(TargetTable && "Global data has not been mapped\n");
- // TODO: This will go away as soon as we consequently pass in async info
- // objects (as references).
- AsyncInfoTy InternalAsyncInfo(Device);
- if (!AsyncInfo)
- AsyncInfo = &InternalAsyncInfo;
-
std::vector<void *> TgtArgs;
std::vector<ptrdiff_t> TgtOffsets;
@@ -1301,10 +1294,10 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
if (IsTeamConstruct)
Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
TgtArgs.size(), TeamNum, ThreadLimit,
- LoopTripCount, *AsyncInfo);
+ LoopTripCount, AsyncInfo);
else
Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
- TgtArgs.size(), *AsyncInfo);
+ TgtArgs.size(), AsyncInfo);
}
if (Ret != OFFLOAD_SUCCESS) {
@@ -1322,13 +1315,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
REPORT("Failed to process data after launching the kernel.\n");
return OFFLOAD_FAIL;
}
- } else {
- // TODO: We should not synchronize here but on the outer level once we pass
- // in a reference AsyncInfo object.
- // If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't
- // hava any argument, and the device supports async operations, so we need a
- // sync at this point.
- return AsyncInfo->synchronize();
}
return OFFLOAD_SUCCESS;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 746eea272d52..fc6997a2d977 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -23,23 +23,23 @@
extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes,
int64_t *arg_types, map_var_info_t *arg_names,
- void **arg_mappers, AsyncInfoTy *AsyncInfo);
+ void **arg_mappers, AsyncInfoTy &AsyncInfo);
extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *arg_names,
- void **ArgMappers, AsyncInfoTy *AsyncInfo);
+ void **ArgMappers, AsyncInfoTy &AsyncInfo);
extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes,
int64_t *arg_types, map_var_info_t *arg_names,
- void **arg_mappers, AsyncInfoTy *AsyncInfo);
+ void **arg_mappers, AsyncInfoTy &AsyncInfo);
extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *arg_names,
void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
- int IsTeamConstruct, AsyncInfoTy *AsyncInfo);
+ int IsTeamConstruct, AsyncInfoTy &AsyncInfo);
extern int CheckDeviceAndCtors(int64_t device_id);
@@ -76,7 +76,7 @@ typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
// targetDataEnd and targetDataUpdate).
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
void **, int64_t *, int64_t *,
- map_var_info_t *, void **, AsyncInfoTy *);
+ map_var_info_t *, void **, AsyncInfoTy &);
// Implemented in libomp, they are called from within __tgt_* functions.
#ifdef __cplusplus
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index efbac2c98629..60f3d4983069 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -400,16 +400,20 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
DeviceTy &Device = PM->Devices[FoundRTL->Idx + i];
Device.PendingGlobalsMtx.lock();
if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
+ AsyncInfoTy AsyncInfo(Device);
for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
- int rc =
- target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
- nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
+ int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
+ nullptr, nullptr, nullptr, 1, 1, true /*team*/,
+ AsyncInfo);
if (rc != OFFLOAD_SUCCESS) {
DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
}
}
// Remove this library's entry from PendingCtorsDtors
Device.PendingCtorsDtors.erase(desc);
+ // All constructors have been issued, wait for them now.
+ if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+ DP("Failed synchronizing destructors kernels.\n");
}
Device.PendingGlobalsMtx.unlock();
}
More information about the Openmp-commits
mailing list