[llvm-branch-commits] [llvm] [offload] Add new features to libompaccsupport for OpenACC (PR #198102)
Ivan R. Ivanov via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat May 16 08:13:26 PDT 2026
https://github.com/ivanradanov created https://github.com/llvm/llvm-project/pull/198102
AsyncInfoTy STATIC_NON_BLOCKING type.
Strided array copies and mapping.
No create mapping type.
Refactoring initialization.
Loading offload objects with OpenACC offloading kind.
---
<sub>Stack created with <a href="https://github.com/github/gh-stack">GitHub Stacks CLI</a> • <a href="https://gh.io/stacks-feedback">Give Feedback 💬</a></sub>
>From 0e9c3cc14186d97727b599dd48e2bf04ee37146e Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <iivanov at nvidia.com>
Date: Sat, 16 May 2026 07:04:01 -0700
Subject: [PATCH] [offload] Add new features to libompaccsupport for OpenACC
AsyncInfoTy STATIC_NON_BLOCKING type.
Strided array copies and mapping.
No create mapping type.
Refactoring initialization.
Loading offload objects with OpenACC offloading kind.
---
offload/include/OpenMP/Mapping.h | 13 +-
offload/include/PluginManager.h | 19 +-
offload/include/Shared/Debug.h | 47 ++++-
offload/include/device.h | 23 +++
offload/include/omptarget.h | 61 ++++++-
offload/include/rtl.h | 4 +
offload/libompaccsupport/Mapping.cpp | 72 +++++++-
offload/libompaccsupport/OffloadRTL.cpp | 32 ++--
offload/libompaccsupport/PluginManager.cpp | 155 ++++++++++++----
offload/libompaccsupport/device.cpp | 44 ++++-
offload/libompaccsupport/exports | 4 +
offload/libompaccsupport/interface.cpp | 27 +++
offload/libomptarget/exports | 5 +-
offload/libomptarget/interface.cpp | 84 +++++----
offload/libomptarget/omptarget.cpp | 67 +++----
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14 ++
.../common/include/PluginInterface.h | 45 +++++
.../common/src/PluginInterface.cpp | 171 ++++++++++++++++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 13 ++
offload/plugins-nextgen/host/src/rtl.cpp | 7 +
20 files changed, 733 insertions(+), 174 deletions(-)
create mode 100644 offload/libompaccsupport/interface.cpp
diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h
index e4024abf26690..ce1301474082a 100644
--- a/offload/include/OpenMP/Mapping.h
+++ b/offload/include/OpenMP/Mapping.h
@@ -667,9 +667,11 @@ struct MappingInfoTy {
/// - Data transfer issue fails.
TargetPointerResultTy getTargetPointer(
HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase,
- int64_t TgtPadding, int64_t Size, map_var_info_t HstPtrName,
- bool HasFlagTo, bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
- bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier,
+ int64_t TgtPadding,
+ std::variant<int64_t, const NonContigDescTy *> MemInfo,
+ map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
+ bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
+ bool HasPresentModifier, bool HasHoldModifier, bool IsNoCreate,
AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR = nullptr,
bool ReleaseHDTTMap = true, StateInfoTy *StateInfo = nullptr);
@@ -712,6 +714,11 @@ struct MappingInfoTy {
int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
int disassociatePtr(void *HstPtrBegin);
+ void printNonContigCopyInfo(void *TgtPtrBegin, void *HstPtrBegin,
+ const NonContigDescTy &CopyInfo, bool H2D,
+ HostDataToTargetTy *Entry,
+ MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr);
+
/// Print information about the transfer from \p HstPtr to \p TgtPtr (or vice
/// versa if \p H2D is false). If there is an existing mapping, or if \p Entry
/// is set, the associated metadata will be printed as well.
diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h
index 6c6fdebe76dff..4dd9fcd3de733 100644
--- a/offload/include/PluginManager.h
+++ b/offload/include/PluginManager.h
@@ -50,7 +50,7 @@ struct PluginManager {
PluginManager() {}
- void init();
+ void initPlugins();
void deinit();
@@ -96,18 +96,19 @@ struct PluginManager {
// Work around for plugins that call dlopen on shared libraries that call
// tgt_register_lib during their initialisation. Stash the pointers in a
// vector until the plugins are all initialised and then register them.
- bool delayRegisterLib(__tgt_bin_desc *Desc) {
+ bool delayRegisterLib(std::function<void(__tgt_bin_desc *)> RegisterFunc,
+ __tgt_bin_desc *Desc) {
if (RTLsLoaded)
return false;
- DelayedBinDesc.push_back(Desc);
+ DelayedBinDesc.push_back({RegisterFunc, Desc});
return true;
}
void registerDelayedLibraries() {
// Only called by libomptarget constructor
RTLsLoaded = true;
- for (auto *Desc : DelayedBinDesc)
- __tgt_register_lib(Desc);
+ for (auto &[RegisterFunc, Desc] : DelayedBinDesc)
+ RegisterFunc(Desc);
DelayedBinDesc.clear();
}
@@ -152,7 +153,9 @@ struct PluginManager {
private:
bool RTLsLoaded = false;
- llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;
+ llvm::SmallVector<
+ std::pair<std::function<void(__tgt_bin_desc *)>, __tgt_bin_desc *>>
+ DelayedBinDesc;
// List of all plugins, in use or not.
llvm::SmallVector<std::unique_ptr<GenericPluginTy>> Plugins;
@@ -183,8 +186,8 @@ struct PluginManager {
__tgt_bin_desc *upgradeLegacyEntries(__tgt_bin_desc *Desc);
};
-/// Initialize the plugin manager and OpenMP runtime.
-void initRuntime();
+/// Initialize the plugin manager.
+void initRuntime(bool OffloadEnabled);
/// Deinitialize the plugin and delete it.
void deinitRuntime();
diff --git a/offload/include/Shared/Debug.h b/offload/include/Shared/Debug.h
index 34f09051f41ba..3f60604e3f79e 100644
--- a/offload/include/Shared/Debug.h
+++ b/offload/include/Shared/Debug.h
@@ -172,6 +172,7 @@ class LLVM_ABI odbg_ostream final : public raw_ostream {
uint32_t BaseLevel;
bool ShouldPrefixNextString;
bool ShouldEmitNewLineOnDestruction;
+ bool ShouldAbortOnDestruction;
bool NeedEndNewLine = false;
/// Buffer to reduce interference between different threads
@@ -223,17 +224,22 @@ class LLVM_ABI odbg_ostream final : public raw_ostream {
public:
explicit odbg_ostream(std::string Prefix, raw_ostream &Os, uint32_t BaseLevel,
bool ShouldPrefixNextString = true,
- bool ShouldEmitNewLineOnDestruction = true)
+ bool ShouldEmitNewLineOnDestruction = true,
+ bool ShouldAbortOnDestruction = false)
: Prefix(std::move(Prefix)), Os(Os), BaseLevel(BaseLevel),
ShouldPrefixNextString(ShouldPrefixNextString),
ShouldEmitNewLineOnDestruction(ShouldEmitNewLineOnDestruction),
- BufferStrm(Buffer) {
+ ShouldAbortOnDestruction(ShouldAbortOnDestruction), BufferStrm(Buffer) {
SetUnbuffered();
}
~odbg_ostream() final {
if (ShouldEmitNewLineOnDestruction && NeedEndNewLine)
BufferStrm << '\n';
Os << BufferStrm.str();
+ if (ShouldAbortOnDestruction) {
+ Os.flush();
+ abort();
+ }
}
odbg_ostream(const odbg_ostream &) = delete;
odbg_ostream &operator=(const odbg_ostream &) = delete;
@@ -242,6 +248,7 @@ class LLVM_ABI odbg_ostream final : public raw_ostream {
BaseLevel = other.BaseLevel;
ShouldPrefixNextString = other.ShouldPrefixNextString;
ShouldEmitNewLineOnDestruction = other.ShouldEmitNewLineOnDestruction;
+ ShouldAbortOnDestruction = other.ShouldAbortOnDestruction;
NeedEndNewLine = other.NeedEndNewLine;
Muted = other.Muted;
BufferStrm << other.BufferStrm.str();
@@ -611,7 +618,8 @@ constexpr const char *ODT_Tool = OLDT_Tool;
constexpr const char *ODT_Module = OLDT_Module;
constexpr const char *ODT_Interop = "Interop";
-static inline odbg_ostream reportErrorStream() {
+static inline odbg_ostream reportErrorStream(bool ShouldAbort,
+ std::string Prefix) {
#ifdef OMPTARGET_DEBUG
if (::llvm::offload::debug::isDebugEnabled()) {
uint32_t RealLevel = ODL_Error;
@@ -619,13 +627,26 @@ static inline odbg_ostream reportErrorStream() {
(ODT_Error), RealLevel))
return odbg_ostream{
::llvm::offload::debug::computePrefix(DEBUG_PREFIX, ODT_Error),
- ::llvm::offload::debug::dbgs(), RealLevel};
+ ::llvm::offload::debug::dbgs(),
+ RealLevel,
+ /*ShouldPrefixNextString=*/true,
+ /*ShouldEmitNewLineOnDestruction=*/true,
+ ShouldAbort};
else
- return odbg_ostream{"", ::llvm::nulls(), 1};
+ return odbg_ostream{"",
+ ::llvm::nulls(),
+ 1,
+ /*ShouldPrefixNextString=*/true,
+ /*ShouldEmitNewLineOnDestruction=*/true,
+ ShouldAbort};
}
#endif
- return odbg_ostream{GETNAME(TARGET_NAME) " error: ",
- ::llvm::offload::debug::dbgs(), ODL_Error};
+ return odbg_ostream{GETNAME(TARGET_NAME) + Prefix,
+ ::llvm::offload::debug::dbgs(),
+ ODL_Error,
+ /*ShouldPrefixNextString=*/true,
+ /*ShouldEmitNewLineOnDestruction=*/true,
+ ShouldAbort};
}
#ifdef OMPTARGET_DEBUG
@@ -693,8 +714,18 @@ static inline raw_ostream &operator<<(raw_ostream &Os, void *Ptr) {
#endif // OMPTARGET_DEBUG
+// New REPORT warning macro in the same style as ODBG
+#define REPORT_WARN() \
+ ::llvm::omp::target::debug::reportErrorStream(/*ShouldAbort=*/false, \
+ " warning: ")
+// New REPORT error macro in the same style as ODBG
+#define REPORT() \
+ ::llvm::omp::target::debug::reportErrorStream(/*ShouldAbort=*/false, \
+ " error: ")
// New REPORT macro in the same style as ODBG
-#define REPORT() ::llvm::omp::target::debug::reportErrorStream()
+#define REPORT_FATAL() \
+ ::llvm::omp::target::debug::reportErrorStream(/*ShouldAbort=*/true, \
+ " fatal error: ")
} // namespace llvm::omp::target::debug
diff --git a/offload/include/device.h b/offload/include/device.h
index af103c316c3cf..c9afae32b3a1e 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -81,6 +81,12 @@ struct DeviceTy {
/// allocator should be used (host, shared, device).
int32_t deleteData(void *TgtPtrBegin, int32_t Kind = TARGET_ALLOC_DEFAULT);
+ int32_t
+ submitNonContigData(void *TgtPtrBegin, void *HstPtrBegin,
+ const NonContigDescTy &CopyInfo, AsyncInfoTy &AsyncInfo,
+ HostDataToTargetTy *Entry = nullptr,
+ MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr = nullptr);
+
// Data transfer. When AsyncInfo is nullptr, the transfer will be
// synchronous.
// Copy data from host to device
@@ -89,6 +95,12 @@ struct DeviceTy {
HostDataToTargetTy *Entry = nullptr,
MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr = nullptr);
+ int32_t
+ retrieveNonContigData(void *HstPtrBegin, void *TgtPtrBegin,
+ const NonContigDescTy &CopyInfo, AsyncInfoTy &AsyncInfo,
+ HostDataToTargetTy *Entry = nullptr,
+ MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr = nullptr);
+
// Copy data from device back to host
int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
AsyncInfoTy &AsyncInfo,
@@ -120,16 +132,27 @@ struct DeviceTy {
KernelExtraArgsTy *KernelExtraArgs,
AsyncInfoTy &AsyncInfo);
+ // Enqueues a host function in the asynchronous queue.
+ int32_t enqueueHostCall(void (*Callback)(void *), void *UserData,
+ AsyncInfoTy &AsyncInfo);
+
/// Synchronize device/queue/event based on \p AsyncInfo and return
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
int32_t synchronize(AsyncInfoTy &AsyncInfo);
+ /// Synchronize device/queue/event based on \p AsyncInfo without releasing the
+ /// queue and return QueueStatusTy::READY / QueueStatusTy::NOT_READY /
+ /// OFFLOAD_FAIL.
+ int32_t synchronizeStatic(AsyncInfoTy &AsyncInfo);
+
/// Query for device/queue/event based completion on \p AsyncInfo in a
/// non-blocking manner and return OFFLOAD_SUCCESS/OFFLOAD_FAIL when
/// succeeds/fails. Must be called multiple times until AsyncInfo is
/// completed and AsyncInfo.isDone() returns true.
int32_t queryAsync(AsyncInfoTy &AsyncInfo);
+ int32_t queryAsyncStatic(AsyncInfoTy &AsyncInfo);
+
/// Calls the corresponding print device info function in the plugin.
bool printDeviceInfo();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index e5d9852ad48a6..57c9c4ddcffcf 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -20,11 +20,15 @@
#include "OpenMP/InternalTypes.h"
+#include <atomic>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
+#include <map>
+#include <mutex>
#include <type_traits>
+#include <variant>
#include "llvm/ADT/SmallVector.h"
@@ -117,7 +121,8 @@ struct DeviceTy;
/// mistakes.
class AsyncInfoTy {
public:
- enum class SyncTy { BLOCKING, NON_BLOCKING };
+ enum class SyncTy { BLOCKING, NON_BLOCKING, STATIC_NON_BLOCKING };
+ using PostProcFuncTy = std::function<int()>;
private:
/// Locations we used in (potentially) asynchronous calls which should live
@@ -127,8 +132,8 @@ class AsyncInfoTy {
/// Post-processing operations executed after a successful synchronization.
/// \note the post-processing function should return OFFLOAD_SUCCESS or
/// OFFLOAD_FAIL appropriately.
- using PostProcFuncTy = std::function<int()>;
llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;
+ std::mutex PostProcessingFunctionsMutex;
__tgt_async_info AsyncInfo;
DeviceTy &Device;
@@ -139,30 +144,43 @@ class AsyncInfoTy {
AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
: Device(Device), SyncType(SyncType) {}
- ~AsyncInfoTy() { synchronize(); }
+ ~AsyncInfoTy() { finalize(); }
/// Implicit conversion to the __tgt_async_info which is used in the
/// plugin interface.
operator __tgt_async_info *() { return &AsyncInfo; }
- /// Synchronize all pending actions.
+ /// Finalizes this instance of AsyncInfoTy.
///
- /// \note synchronization will be performance in a blocking or non-blocking
- /// manner, depending on the SyncType.
+ /// \note synchronization will be performed only if SyncType is blocking.
///
- /// \note if the operations are completed, the registered post-processing
- /// functions will be executed once and unregistered afterwards.
+ /// \note in all SyncType cases, if the operations are completed, the
+ /// registered post-processing functions will be executed once and
+ /// unregistered afterwards.
///
/// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
+ int finalize();
+
+ /// Synchronize all pending actions.
+ ///
+ /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS depending on whether an error was
+ /// encountered.
int synchronize();
+ /// Queries whether all pending actions are done. This function does not
+ /// return the queue to the RTL.
+ ///
+ /// \returns OFFLOAD_FAIL on error, 0 when the actions are done, and 1 when
+ /// they are pending.
+ int query();
+
/// Return a void* reference with a lifetime that is at least as long as this
/// AsyncInfoTy object. The location can be used as intermediate buffer.
void *&getVoidPtrLocation();
/// Check if all asynchronous operations are completed.
///
- /// \note only a lightweight check. If needed, use synchronize() to query the
+ /// \note only a lightweight check. If needed, use finalize() to query the
/// status of AsyncInfo before checking.
///
/// \returns true if there is no pending asynchronous operations, false
@@ -178,6 +196,7 @@ class AsyncInfoTy {
static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
"Invalid post-processing function type. Please check "
"function signature!");
+ std::lock_guard<std::mutex> PPFGuard{PostProcessingFunctionsMutex};
PostProcessingFunctions.emplace_back(Function);
}
@@ -270,6 +289,30 @@ struct __tgt_target_non_contig {
uint64_t Stride;
};
+struct NonContigDescTy {
+ llvm::SmallVector<__tgt_target_non_contig, 6> Dims;
+
+ const __tgt_target_non_contig &getDim(unsigned I) { return Dims[I]; }
+ unsigned getRank() const { return Dims.size(); }
+
+ void mergeContiguousDims() {
+ int RemovedDim = 0;
+ for (int I = getRank() - 1; I > 0; --I) {
+ if (Dims[I].Count * Dims[I].Stride == Dims[I - 1].Stride)
+ RemovedDim++;
+ }
+ Dims.resize(getRank() - RemovedDim);
+ }
+
+ uint64_t getLastDimCopySize() const {
+ return Dims.back().Count * Dims.back().Stride;
+ }
+
+ uint64_t getAllocSize() const {
+ return (Dims[0].Count + Dims[0].Offset) * Dims[0].Stride;
+ }
+};
+
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/offload/include/rtl.h b/offload/include/rtl.h
index 38f1dd24011e0..0320f6d15a458 100644
--- a/offload/include/rtl.h
+++ b/offload/include/rtl.h
@@ -53,4 +53,8 @@ struct TableMap {
};
typedef std::map<void *, TableMap> HostPtrToTableMapTy;
+namespace llvm::offload {
+TableMap *getTableMap(void *HostPtr);
+}
+
#endif
diff --git a/offload/libompaccsupport/Mapping.cpp b/offload/libompaccsupport/Mapping.cpp
index 1bb2e424bd083..6ca53b042b164 100644
--- a/offload/libompaccsupport/Mapping.cpp
+++ b/offload/libompaccsupport/Mapping.cpp
@@ -14,6 +14,10 @@
#include "Shared/Debug.h"
#include "Shared/Requirements.h"
#include "device.h"
+#include "omptarget.h"
+#include <cstdint>
+#include <optional>
+#include <variant>
using namespace llvm::omp::target::debug;
@@ -206,12 +210,22 @@ LookupResult MappingInfoTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
TargetPointerResultTy MappingInfoTy::getTargetPointer(
HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin, void *HstPtrBase,
- int64_t TgtPadding, int64_t Size, map_var_info_t HstPtrName, bool HasFlagTo,
- bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
- bool HasCloseModifier, bool HasPresentModifier, bool HasHoldModifier,
+ int64_t TgtPadding, std::variant<int64_t, const NonContigDescTy *> MemInfo,
+ map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
+ bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
+ bool HasPresentModifier, bool HasHoldModifier, bool IsNoCreate,
AsyncInfoTy &AsyncInfo, HostDataToTargetTy *OwnedTPR, bool ReleaseHDTTMap,
StateInfoTy *StateInfo) {
+ int64_t Size;
+ const NonContigDescTy *CopyInfo = nullptr;
+ if (std::holds_alternative<int64_t>(MemInfo)) {
+ Size = std::get<int64_t>(MemInfo);
+ } else {
+ CopyInfo = std::get<const NonContigDescTy *>(MemInfo);
+ Size = CopyInfo->getAllocSize();
+ }
+
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size, OwnedTPR);
LR.TPR.Flags.IsPresent = true;
@@ -250,7 +264,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction,
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
LR.TPR.TargetPointer = (void *)Ptr;
- } else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit) {
+ } else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit &&
+ !IsNoCreate) {
// Explicit extension of mapped data - not allowed.
MESSAGE("explicit extension not allowed: host address specified is " DPxMOD
" (%" PRId64
@@ -290,7 +305,7 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
MESSAGE("device mapping required by 'present' map type modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size);
- } else if (Size) {
+ } else if (Size && !IsNoCreate) {
// If it is not contained and Size > 0, we should create a new entry for it.
LR.TPR.Flags.IsNewEntry = true;
uintptr_t TgtAllocBegin =
@@ -329,6 +344,12 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
if (ReleaseHDTTMap)
HDTTMap.destroy();
+ if (!LR.TPR.isPresent() && IsNoCreate) {
+ ODBG() << "Mapping for " << HstPtrBegin << " with size " << Size
+ << " does not exist and no_create is specified: returning.";
+ return std::move(LR.TPR);
+ }
+
// Lambda to check if this pointer was newly allocated on the current region.
// This is needed to handle cases when the TO entry is encountered after an
// alloc entry for the same pointer. In such cases, the ref-count is already
@@ -384,8 +405,14 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
ODBG(ODT_Mapping) << "Moving " << Size << " bytes (hst:" << HstPtrBegin
<< ") -> (tgt:" << LR.TPR.TargetPointer << ")";
- int Ret = Device.submitData(LR.TPR.TargetPointer, HstPtrBegin, Size,
- AsyncInfo, LR.TPR.getEntry());
+ int Ret;
+ if (CopyInfo) {
+ Ret = Device.submitNonContigData(LR.TPR.TargetPointer, HstPtrBegin,
+ *CopyInfo, AsyncInfo, LR.TPR.getEntry());
+ } else {
+ Ret = Device.submitData(LR.TPR.TargetPointer, HstPtrBegin, Size,
+ AsyncInfo, LR.TPR.getEntry());
+ }
if (Ret != OFFLOAD_SUCCESS) {
REPORT() << "Copying data to device failed.";
// We will also return nullptr if the data movement fails because that
@@ -555,6 +582,27 @@ int MappingInfoTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry,
return Ret;
}
+static void printNonContigCopyInfoImpl(int DeviceId, bool H2D,
+ void *SrcPtrBegin, void *DstPtrBegin,
+ const NonContigDescTy &CopyInfo,
+ HostDataToTargetTy *HT) {
+
+ INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceId,
+ "Copying non-contiguous data from %s to %s, %sPtr=" DPxMOD
+ ", %sPtr=" DPxMOD ", Name=%s\n",
+ H2D ? "host" : "device", H2D ? "device" : "host", H2D ? "Hst" : "Tgt",
+ DPxPTR(H2D ? SrcPtrBegin : DstPtrBegin), H2D ? "Tgt" : "Hst",
+ DPxPTR(H2D ? DstPtrBegin : SrcPtrBegin),
+ (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
+ : "unknown");
+ ODBG(ODT_Mapping) << "Non-contiguous data descriptor:\n";
+ for (unsigned I = 0; I < CopyInfo.getRank(); I++)
+ ODBG(ODT_Mapping) << " Dim " << I << " : Offset "
+ << CopyInfo.Dims[I].Offset << " Count "
+ << CopyInfo.Dims[I].Count << " Stride "
+ << CopyInfo.Dims[I].Stride << "\n";
+}
+
static void printCopyInfoImpl(int DeviceId, bool H2D, void *SrcPtrBegin,
void *DstPtrBegin, int64_t Size,
HostDataToTargetTy *HT) {
@@ -569,6 +617,16 @@ static void printCopyInfoImpl(int DeviceId, bool H2D, void *SrcPtrBegin,
: "unknown");
}
+void MappingInfoTy::printNonContigCopyInfo(
+ void *TgtPtrBegin, void *HstPtrBegin, const NonContigDescTy &CopyInfo,
+ bool H2D, HostDataToTargetTy *Entry,
+ MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
+ auto HDTTMap =
+ HostDataToTargetMap.getExclusiveAccessor(!!Entry || !!HDTTMapPtr);
+ printNonContigCopyInfoImpl(Device.DeviceID, H2D, HstPtrBegin, TgtPtrBegin,
+ CopyInfo, Entry);
+}
+
void MappingInfoTy::printCopyInfo(
void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, bool H2D,
HostDataToTargetTy *Entry, MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
diff --git a/offload/libompaccsupport/OffloadRTL.cpp b/offload/libompaccsupport/OffloadRTL.cpp
index 9b02376609cee..972138a8814ad 100644
--- a/offload/libompaccsupport/OffloadRTL.cpp
+++ b/offload/libompaccsupport/OffloadRTL.cpp
@@ -23,25 +23,15 @@ using namespace llvm::omp::target::debug;
static std::mutex PluginMtx;
static uint32_t RefCount = 0;
+static bool PluginsInitialized = 0;
std::atomic<bool> RTLAlive{false};
std::atomic<int> RTLOngoingSyncs{0};
-/// Check deleted and deprecated features, such as environment variables.
-static void checkRuntimeEnvironment() {
- const char *ShmemEnvarName = "LIBOMPTARGET_SHARED_MEMORY_SIZE";
- if (std::getenv(ShmemEnvarName))
- MESSAGE("Warning: %s is no longer valid. Please use OpenMP clause "
- "'dyn_groupprivate' instead.\n",
- ShmemEnvarName);
-}
-
-void initRuntime() {
+void initRuntime(bool OffloadEnabled) {
std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
Profiler::get();
TIMESCOPE();
- checkRuntimeEnvironment();
-
if (PM == nullptr)
PM = new PluginManager();
@@ -53,17 +43,27 @@ void initRuntime() {
llvm::omp::target::ompt::connectLibrary();
#endif
- PM->init();
- PM->registerDelayedLibraries();
+ if (!OffloadEnabled)
+ ODBG(ODT_Init) << "Offload is disabled. Skipping plugin initialization";
// RTL initialization is complete
RTLAlive = true;
}
+
+ // Initialize the plugins the first time this function is called with
+ // OffloadEnabled == true.
+ if (!PluginsInitialized && OffloadEnabled) {
+ ODBG(ODT_Init) << "Offload is enabled. Initializating plugins";
+ PM->initPlugins();
+ PM->registerDelayedLibraries();
+ PluginsInitialized = true;
+ }
}
void deinitRuntime() {
std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
assert(PM && "Runtime not initialized");
+ assert(RefCount != 0 && "Unmatched init and deinit");
if (RefCount == 1) {
ODBG(ODT_Deinit) << "Deinit offload library!";
@@ -74,10 +74,12 @@ void deinitRuntime() {
<< RTLOngoingSyncs.load();
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
+
PM->deinit();
delete PM;
PM = nullptr;
- }
+ PluginsInitialized = false;
+ }
RefCount--;
}
diff --git a/offload/libompaccsupport/PluginManager.cpp b/offload/libompaccsupport/PluginManager.cpp
index 3241a8ecc764f..3f22f3fc931b7 100644
--- a/offload/libompaccsupport/PluginManager.cpp
+++ b/offload/libompaccsupport/PluginManager.cpp
@@ -15,6 +15,7 @@
#include "Shared/Debug.h"
#include "Shared/Profile.h"
#include "device.h"
+#include "omptarget.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
@@ -30,7 +31,7 @@ PluginManager *PM = nullptr;
#define PLUGIN_TARGET(Name) extern "C" GenericPluginTy *createPlugin_##Name();
#include "Shared/Targets.def"
-int AsyncInfoTy::synchronize() {
+int AsyncInfoTy::finalize() {
int Result = OFFLOAD_SUCCESS;
if (!isQueueEmpty()) {
switch (SyncType) {
@@ -44,6 +45,9 @@ int AsyncInfoTy::synchronize() {
case SyncTy::NON_BLOCKING:
Result = Device.queryAsync(*this);
break;
+ case SyncTy::STATIC_NON_BLOCKING:
+ Result = Device.queryAsyncStatic(*this);
+ break;
}
}
@@ -54,6 +58,67 @@ int AsyncInfoTy::synchronize() {
return Result;
}
+static int32_t processPPFs(SmallVector<AsyncInfoTy::PostProcFuncTy> &PPFs) {
+ for (size_t I = 0; I < PPFs.size(); ++I)
+ if (int Res = PPFs[I](); Res != OFFLOAD_SUCCESS)
+ return Res;
+ return OFFLOAD_SUCCESS;
+}
+
+int AsyncInfoTy::synchronize() {
+ assert(SyncType == SyncTy::STATIC_NON_BLOCKING);
+
+ // We still have not created a queue, or this specific device does not
+ // generate queues.
+ if (isQueueEmpty())
+ return OFFLOAD_SUCCESS;
+
+ int Result = OFFLOAD_SUCCESS;
+ switch (SyncType) {
+ case SyncTy::BLOCKING:
+ case SyncTy::NON_BLOCKING: {
+ // BLOCKING and NON_BLOCKING types return the queue to the RTL after
+ // synchronization.
+ Result = Device.synchronize(*this);
+ assert(AsyncInfo.Queue == nullptr &&
+ "The device plugin should have nulled the queue to indicate there "
+ "are no outstanding actions!");
+ // Run any pending post-processing function registered on this async object.
+ if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
+ Result = runPostProcessing();
+ return Result;
+ }
+ case SyncTy::STATIC_NON_BLOCKING: {
+ // STATIC_NON_BLOCKING retains its queue thus more careful handling of the
+ // post processing functions is required.
+
+ // Collect the enqueued PPFs until this point.
+ SmallVector<PostProcFuncTy> LocalPPFs;
+ {
+ std::lock_guard<std::mutex> PPFGuard{PostProcessingFunctionsMutex};
+ std::swap(LocalPPFs, PostProcessingFunctions);
+ }
+ Result = Device.synchronizeStatic(*this);
+ // Run only the post-processing functions collected _before_ we
+ // synchronized. This matters because more post-processing operations could
+ // have been enqueued between the collection and the end of the
+ // synchronization, and those must not run yet.
+ if (Result == OFFLOAD_SUCCESS)
+ Result = processPPFs(LocalPPFs);
+
+ return Result;
+ }
+ }
+ llvm_unreachable("Unexpected SyncType");
+}
+
+int AsyncInfoTy::query() {
+ // If we don't have a queue, there are no pending actions.
+ if (isQueueEmpty())
+ return 0;
+ return Device.queryAsyncStatic(*this);
+}
+
void *&AsyncInfoTy::getVoidPtrLocation() {
BufferLocations.push_back(nullptr);
return BufferLocations.back();
@@ -62,30 +127,26 @@ void *&AsyncInfoTy::getVoidPtrLocation() {
bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }
int32_t AsyncInfoTy::runPostProcessing() {
- size_t Size = PostProcessingFunctions.size();
- for (size_t I = 0; I < Size; ++I) {
- const int Result = PostProcessingFunctions[I]();
+ // Post-processing procedures might add new procedures themselves, so
+ // repeatedly process them until we are done.
+ while (true) {
+ SmallVector<PostProcFuncTy> LocalPPFs;
+ {
+ std::lock_guard<std::mutex> PPFGuard{PostProcessingFunctionsMutex};
+ std::swap(LocalPPFs, PostProcessingFunctions);
+ }
+ if (LocalPPFs.size() == 0)
+ return OFFLOAD_SUCCESS;
+ int32_t Result = processPPFs(LocalPPFs);
if (Result != OFFLOAD_SUCCESS)
return Result;
}
-
- // Clear the vector up until the last known function, since post-processing
- // procedures might add new procedures themselves.
- const auto *PrevBegin = PostProcessingFunctions.begin();
- PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);
-
- return OFFLOAD_SUCCESS;
}
bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
-void PluginManager::init() {
+void PluginManager::initPlugins() {
TIMESCOPE();
- if (OffloadPolicy::isOffloadDisabled()) {
- ODBG(ODT_Init) << "Offload is disabled. Skipping plugin initialization";
- return;
- }
-
ODBG(ODT_Init) << "Loading RTLs";
// Attempt to create an instance of each supported plugin.
@@ -179,13 +240,6 @@ void PluginManager::initializeAllDevices() {
initializeDevice(Plugin, DeviceId);
}
}
- // After all plugins are initialized, register atExit cleanup handlers
- std::atexit([]() {
- // Interop cleanup should be done before the plugins are deinitialized as
- // the backend libraries may be already unloaded.
- if (PM)
- PM->InteropTbl.clear();
- });
}
// Returns a pointer to the binary descriptor, upgrading from a legacy format if
@@ -365,10 +419,6 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
ODBG(ODT_Init) << "Done registering entries!";
}
-// Temporary forward declaration, old style CTor/DTor handling is going away.
-int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
- KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
-
void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
ODBG(ODT_Deinit) << "Unloading target library!";
@@ -413,7 +463,8 @@ void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
PM->TblMapMtx.lock();
for (llvm::offloading::EntryTy *Cur = Desc->HostEntriesBegin;
Cur < Desc->HostEntriesEnd; ++Cur) {
- if (Cur->Kind == object::OffloadKind::OFK_OpenMP)
+ if (Cur->Kind == object::OffloadKind::OFK_OpenMP ||
+ Cur->Kind == object::OffloadKind::OFK_OpenACC)
PM->HostPtrToTableMap.erase(Cur->Address);
}
@@ -483,7 +534,8 @@ static int loadImagesOntoDevice(DeviceTy &Device) {
TransTable->TargetsEntries[DeviceId];
for (llvm::offloading::EntryTy &Entry :
llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
- if (Entry.Kind != object::OffloadKind::OFK_OpenMP)
+ if (Entry.Kind != object::OffloadKind::OFK_OpenMP &&
+ Entry.Kind != object::OffloadKind::OFK_OpenACC)
continue;
__tgt_device_binary &Binary = *BinaryOrErr;
@@ -537,7 +589,8 @@ static int loadImagesOntoDevice(DeviceTy &Device) {
CurrDeviceEntry != EntryDeviceEnd;
CurrDeviceEntry++, CurrHostEntry++) {
if (CurrDeviceEntry->Size == 0 ||
- CurrDeviceEntry->Kind != object::OffloadKind::OFK_OpenMP)
+ (CurrDeviceEntry->Kind != object::OffloadKind::OFK_OpenMP &&
+ CurrDeviceEntry->Kind != object::OffloadKind::OFK_OpenACC))
continue;
assert(CurrDeviceEntry->Size == CurrHostEntry->Size &&
@@ -560,7 +613,7 @@ static int loadImagesOntoDevice(DeviceTy &Device) {
void *DevPtr;
Device.retrieveData(&DevPtr, CurrDeviceEntryAddr, sizeof(void *),
AsyncInfo, /*Entry=*/nullptr, &HDTTMap);
- if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
+ if (AsyncInfo.finalize() != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
CurrDeviceEntryAddr = DevPtr;
}
@@ -620,3 +673,41 @@ Expected<DeviceTy &> PluginManager::getDevice(uint32_t DeviceNo) {
DeviceNo);
return *DevicePtr;
}
+
+namespace llvm::offload {
+/// Find the table information in the map or look it up in the translation
+/// tables.
+TableMap *getTableMap(void *HostPtr) {
+ std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
+ HostPtrToTableMapTy::iterator TableMapIt =
+ PM->HostPtrToTableMap.find(HostPtr);
+
+ if (TableMapIt != PM->HostPtrToTableMap.end())
+ return &TableMapIt->second;
+
+ // We don't have a map. So search all the registered libraries.
+ TableMap *TM = nullptr;
+ std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
+ for (HostEntriesBeginToTransTableTy::iterator Itr =
+ PM->HostEntriesBeginToTransTable.begin();
+ Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
+ // get the translation table (which contains all the good info).
+ TranslationTable *TransTable = &Itr->second;
+ // iterate over all the host table entries to see if we can locate the
+ // host_ptr.
+ llvm::offloading::EntryTy *Cur = TransTable->HostTable.EntriesBegin;
+ for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
+ if (Cur->Address != HostPtr)
+ continue;
+ // we got a match, now fill the HostPtrToTableMap so that we
+ // may avoid this search next time.
+ TM = &(PM->HostPtrToTableMap)[HostPtr];
+ TM->Table = TransTable;
+ TM->Index = I;
+ return TM;
+ }
+ }
+
+ return nullptr;
+}
+} // namespace llvm::offload
diff --git a/offload/libompaccsupport/device.cpp b/offload/libompaccsupport/device.cpp
index 546f679353544..010d05a51ab2c 100644
--- a/offload/libompaccsupport/device.cpp
+++ b/offload/libompaccsupport/device.cpp
@@ -19,7 +19,6 @@
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "omptarget.h"
-#include "private.h"
#include "rtl.h"
#include "Shared/EnvironmentVar.h"
@@ -40,6 +39,8 @@ using namespace llvm::omp::target::ompt;
using namespace llvm::omp::target::plugin;
using namespace llvm::omp::target::debug;
+// TODO disable OMPT if we call from OpenACC
+
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
AsyncInfoTy &AsyncInfo) const {
// First, check if the user disabled atomic map transfer/malloc/dealloc.
@@ -123,7 +124,8 @@ setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image,
Image->EntriesEnd);
llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
for (const auto &Entry : Entries) {
- if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP ||
+ if ((Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP &&
+ Entry.Kind != llvm::object::OffloadKind::OFK_OpenACC) ||
Entry.Size == 0 ||
(!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT) &&
!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE)))
@@ -280,6 +282,18 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
AsyncInfo);
}
+int32_t
+DeviceTy::submitNonContigData(void *TgtPtrBegin, void *HstPtrBegin,
+ const NonContigDescTy &CopyInfo,
+ AsyncInfoTy &AsyncInfo, HostDataToTargetTy *Entry,
+ MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
+ if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
+ MappingInfo.printNonContigCopyInfo(TgtPtrBegin, HstPtrBegin, CopyInfo,
+ /*H2D=*/true, Entry, HDTTMapPtr);
+ return RTL->data_non_contig_submit_async(RTLDeviceID, TgtPtrBegin,
+ HstPtrBegin, CopyInfo, AsyncInfo);
+}
+
// Retrieve data from device
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
int64_t Size, AsyncInfoTy &AsyncInfo,
@@ -300,6 +314,17 @@ int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
AsyncInfo);
}
+int32_t DeviceTy::retrieveNonContigData(
+ void *HstPtrBegin, void *TgtPtrBegin, const NonContigDescTy &CopyInfo,
+ AsyncInfoTy &AsyncInfo, HostDataToTargetTy *Entry,
+ MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
+ if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
+ MappingInfo.printNonContigCopyInfo(TgtPtrBegin, HstPtrBegin, CopyInfo,
+ /*H2D=*/false, Entry, HDTTMapPtr);
+ return RTL->data_non_contig_retrieve_async(RTLDeviceID, HstPtrBegin,
+ TgtPtrBegin, CopyInfo, AsyncInfo);
+}
+
// Copy data from current device to destination device directly
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, AsyncInfoTy &AsyncInfo) {
@@ -347,7 +372,6 @@ int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
return OFFLOAD_SUCCESS;
}
-// Run region on device
int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
KernelExtraArgsTy *KernelExtraArgs,
@@ -356,7 +380,11 @@ int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
&KernelArgs, KernelExtraArgs, AsyncInfo);
}
-// Run region on device
+int32_t DeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData,
+ AsyncInfoTy &AsyncInfo) {
+ return RTL->enqueue_host_call(RTLDeviceID, Callback, UserData, AsyncInfo);
+}
+
bool DeviceTy::printDeviceInfo() {
RTL->print_device_info(RTLDeviceID);
return true;
@@ -376,10 +404,18 @@ int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
return RTL->synchronize(RTLDeviceID, AsyncInfo);
}
+int32_t DeviceTy::synchronizeStatic(AsyncInfoTy &AsyncInfo) {
+ return RTL->synchronize_static(RTLDeviceID, AsyncInfo);
+}
+
int32_t DeviceTy::queryAsync(AsyncInfoTy &AsyncInfo) {
return RTL->query_async(RTLDeviceID, AsyncInfo);
}
+int32_t DeviceTy::queryAsyncStatic(AsyncInfoTy &AsyncInfo) {
+ return RTL->query_async_static(RTLDeviceID, AsyncInfo);
+}
+
int32_t DeviceTy::createEvent(void **Event) {
return RTL->create_event(RTLDeviceID, Event);
}
diff --git a/offload/libompaccsupport/exports b/offload/libompaccsupport/exports
index b67f7cbd8890a..68ae4c0443a98 100644
--- a/offload/libompaccsupport/exports
+++ b/offload/libompaccsupport/exports
@@ -1,5 +1,9 @@
VERS1.0 {
global:
+ __tgt_rtl_init;
+ __tgt_rtl_deinit;
+ __tgt_init_all_rtls;
+ __tgt_register_rpc_callback;
*;
local:
diff --git a/offload/libompaccsupport/interface.cpp b/offload/libompaccsupport/interface.cpp
new file mode 100644
index 0000000000000..f178476d7c5ab
--- /dev/null
+++ b/offload/libompaccsupport/interface.cpp
@@ -0,0 +1,27 @@
+//===-------- interface.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PluginManager.h"
+#include "omptarget.h"
+
+EXTERN void __tgt_rtl_init() { initRuntime(/*OffloadEnabled=*/true); }
+EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Initialize all available devices without registering any image
+EXTERN void __tgt_init_all_rtls() {
+ assert(PM && "Runtime not initialized");
+ PM->initializeAllDevices();
+}
+
+EXTERN void __tgt_register_rpc_callback(unsigned (*Callback)(void *,
+ unsigned)) {
+ for (auto &Plugin : PM->plugins())
+ if (Plugin.is_initialized())
+ Plugin.getRPCServer().registerCallback(Callback);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 1831c43cc5f29..7073d25fabd55 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -1,11 +1,8 @@
VERS1.0 {
global:
- __tgt_rtl_init;
- __tgt_rtl_deinit;
__tgt_register_requires;
__tgt_register_lib;
__tgt_unregister_lib;
- __tgt_init_all_rtls;
__tgt_target_data_begin;
__tgt_target_data_end;
__tgt_target_data_update;
@@ -79,10 +76,10 @@ VERS1.0 {
__tgt_interop_use60;
__tgt_interop_release;
__tgt_target_sync;
- __tgt_register_rpc_callback;
__llvmPushCallConfiguration;
__llvmPopCallConfiguration;
llvmLaunchKernel;
+
local:
*;
};
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index 64ff078e3ec46..b1daeecac757f 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -38,6 +38,47 @@ using namespace llvm::omp::target::ompt;
#endif
using namespace llvm::omp::target::debug;
+static std::mutex InitMutex;
+static uint32_t InitRefCount = 0;
+
+/// Warn about deleted and deprecated features, such as environment variables.
+static void checkRuntimeEnvironment() {
+  constexpr const char *ShmemEnvarName = "LIBOMPTARGET_SHARED_MEMORY_SIZE";
+  if (!std::getenv(ShmemEnvarName))
+    return; // Variable is not set, so there is nothing to warn about.
+  MESSAGE("Warning: %s is no longer valid. Please use OpenMP clause "
+          "'dyn_groupprivate' instead.\n", ShmemEnvarName);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// adds a target shared library to the target execution image
+EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
+ std::scoped_lock<decltype(InitMutex)> Lock(InitMutex);
+ checkRuntimeEnvironment();
+ initRuntime(!OffloadPolicy::isOffloadDisabled());
+ if (PM->delayRegisterLib(__tgt_register_lib, Desc))
+ return;
+
+ PM->registerLib(Desc);
+ InitRefCount++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unloads a target shared library, then calls deinitRuntime().
+EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
+  std::scoped_lock<decltype(InitMutex)> Lock(InitMutex);
+  PM->unregisterLib(Desc);
+
+  // Once the last counted library is gone, clear the interop table before the
+  // plugins are deinitialized; the backend libraries may already be unloaded.
+  const bool IsLastLibrary = InitRefCount == 1;
+  if (IsLastLibrary && PM)
+    PM->InteropTbl.clear();
+
+  --InitRefCount;
+  deinitRuntime();
+}
+
// If offload is enabled, ensure that device DeviceID has been initialized.
//
// The return bool indicates if the offload is to the host device
@@ -83,34 +124,6 @@ EXTERN void __tgt_register_requires(int64_t Flags) {
__PRETTY_FUNCTION__);
}
-EXTERN void __tgt_rtl_init() { initRuntime(); }
-EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }
-
-////////////////////////////////////////////////////////////////////////////////
-/// adds a target shared library to the target execution image
-EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
- initRuntime();
- if (PM->delayRegisterLib(Desc))
- return;
-
- PM->registerLib(Desc);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Initialize all available devices without registering any image
-EXTERN void __tgt_init_all_rtls() {
- assert(PM && "Runtime not initialized");
- PM->initializeAllDevices();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// unloads a target shared library
-EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
- PM->unregisterLib(Desc);
-
- deinitRuntime();
-}
-
template <typename TargetAsyncInfoTy>
static inline void
targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
@@ -187,7 +200,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
Rc = processAttachEntries(*DeviceOrErr, *StateInfo, AsyncInfo);
if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
+ Rc = AsyncInfo.finalize();
}
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
@@ -440,7 +453,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
{ // required to show synchronization
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: synchronize", "", Loc);
if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
+ Rc = AsyncInfo.finalize();
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
@@ -552,7 +565,7 @@ EXTERN int __tgt_target_kernel_replay(
LoopTripCount, AsyncInfo, ReplayOutcome);
if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
+ Rc = AsyncInfo.finalize();
if (Rc != OFFLOAD_SUCCESS) {
ODBG(ODT_Interface) << "Kernel replay failed in device " << DeviceId;
@@ -626,7 +639,7 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
if (QueryCounter.isAboveThreshold())
AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING;
- if (AsyncInfo->synchronize())
+ if (AsyncInfo->finalize())
FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
// If there are device operations still pending, return immediately without
// deallocating the handle and increase the current thread query count.
@@ -643,10 +656,3 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
delete AsyncInfo;
*AsyncHandle = nullptr;
}
-
-EXTERN void __tgt_register_rpc_callback(unsigned (*Callback)(void *,
- unsigned)) {
- for (auto &Plugin : PM->plugins())
- if (Plugin.is_initialized())
- Plugin.getRPCServer().registerCallback(Callback);
-}
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 6853b7155e3ec..c356c129f0e6b 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -16,6 +16,7 @@
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
+#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Utils.h"
@@ -472,8 +473,13 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
+
+ assert(!(ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG));
+
+ int64_t DataSize = ArgSizes[I];
+
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
+ "HostToDev", "Size=" + std::to_string(DataSize) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -482,7 +488,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
<< "th argument";
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
- int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
+ int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], DataSize,
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataBegin, StateInfo);
@@ -498,7 +504,6 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void *HstPtrBegin = Args[I];
void *HstPtrBase = ArgsBase[I];
- int64_t DataSize = ArgSizes[I];
map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
// ATTACH map-types are supposed to be handled after all mapping for the
@@ -571,10 +576,12 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
// PTR_AND_OBJ entry is handled below, and so the allocation might fail
// when HasPresentModifier.
PointerTpr = Device.getMappingInfo().getTargetPointer(
- HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
+ HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0,
+ static_cast<int64_t>(sizeof(void *)),
/*HstPtrName=*/nullptr,
/*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
- HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
+ HasCloseModifier, HasPresentModifier, HasHoldModifier,
+ /*IsNoCreate=*/false, AsyncInfo,
/*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
PointerTgtPtrBegin = PointerTpr.TargetPointer;
IsHostPtr = PointerTpr.Flags.IsHostPointer;
@@ -608,9 +615,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
// Note that HDTTMap will be released in getTargetPointer.
auto TPR = Device.getMappingInfo().getTargetPointer(
- HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
+ HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, ArgSizes[I], HstPtrName,
HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
- HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry(),
+ HasPresentModifier, HasHoldModifier, /*IsNoCreate=*/false, AsyncInfo,
+ PointerTpr.getEntry(),
/*ReleaseHDTTMap=*/true, StateInfo);
void *TgtPtrBegin = TPR.TargetPointer;
IsHostPtr = TPR.Flags.IsHostPointer;
@@ -1068,8 +1076,10 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
continue;
}
- void *HstPtrBegin = Args[I];
+ assert(!(ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG));
int64_t DataSize = ArgSizes[I];
+
+ void *HstPtrBegin = Args[I];
bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
@@ -1182,7 +1192,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
}
}
- int Ret = Device.retrieveData(HstPtr, TgtPtr, Size, AsyncInfo, Entry);
+ int Ret =
+ Device.retrieveData(HstPtr, TgtPtr, Size, AsyncInfo, TPR.getEntry());
if (Ret != OFFLOAD_SUCCESS) {
REPORT() << "Copying data from device failed.";
return OFFLOAD_FAIL;
@@ -1561,43 +1572,9 @@ static bool isLambdaMapping(int64_t Mapping) {
return (Mapping & LambdaMapping) == LambdaMapping;
}
-namespace {
-/// Find the table information in the map or look it up in the translation
-/// tables.
-TableMap *getTableMap(void *HostPtr) {
- std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
- HostPtrToTableMapTy::iterator TableMapIt =
- PM->HostPtrToTableMap.find(HostPtr);
-
- if (TableMapIt != PM->HostPtrToTableMap.end())
- return &TableMapIt->second;
-
- // We don't have a map. So search all the registered libraries.
- TableMap *TM = nullptr;
- std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
- for (HostEntriesBeginToTransTableTy::iterator Itr =
- PM->HostEntriesBeginToTransTable.begin();
- Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
- // get the translation table (which contains all the good info).
- TranslationTable *TransTable = &Itr->second;
- // iterate over all the host table entries to see if we can locate the
- // host_ptr.
- llvm::offloading::EntryTy *Cur = TransTable->HostTable.EntriesBegin;
- for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
- if (Cur->Address != HostPtr)
- continue;
- // we got a match, now fill the HostPtrToTableMap so that we
- // may avoid this search next time.
- TM = &(PM->HostPtrToTableMap)[HostPtr];
- TM->Table = TransTable;
- TM->Index = I;
- return TM;
- }
- }
-
- return nullptr;
-}
+using llvm::offload::getTableMap;
+namespace {
/// A class manages private arguments in a target region.
class PrivateArgumentManagerTy {
/// A data structure for the information of first-private arguments. We can
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index f4e81025a285d..ba3127132b88d 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2684,6 +2684,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ /// Query for the completion of the pending operations on the async info.
+ Expected<QueueStatusTy>
+ queryAsyncStaticImpl(__tgt_async_info &AsyncInfo) override {
+ AMDGPUStreamTy *Stream =
+ reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+ assert(Stream && "Invalid stream");
+
+ auto CompletedOrErr = Stream->query();
+ if (!CompletedOrErr)
+ return CompletedOrErr.takeError();
+
+ return *CompletedOrErr ? QueueStatusTy::READY : QueueStatusTy::NOT_READY;
+ }
+
/// Pin the host buffer and return the device pointer that should be used for
/// device transfers.
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 3a354af4775d8..ae147bd09530a 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -825,6 +825,8 @@ class PinnedAllocationMapTy {
}
};
+enum class QueueStatusTy { READY = 0, NOT_READY = 1 };
+
/// Class implementing common functionalities of offload devices. Each plugin
/// should define the specific device class, derive from this generic one, and
/// implement the necessary virtual function members.
@@ -925,6 +927,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo, bool ReleaseQueue,
bool *IsQueueWorkCompleted) = 0;
+ Expected<QueueStatusTy> queryAsyncStatic(__tgt_async_info *AsyncInfo);
+ virtual Expected<QueueStatusTy>
+ queryAsyncStaticImpl(__tgt_async_info &AsyncInfo) = 0;
+
/// Check whether the architecture supports VA management
virtual bool supportVAManagement() const { return false; }
@@ -1003,12 +1009,28 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ /// Submit non-contiguous data to the device (host to device transfer).
+ Error dataNonContigSubmit(void *TgtPtr, const void *HstPtr,
+ const NonContigDescTy &CopyInfo,
+ __tgt_async_info *AsyncInfo);
+ virtual Error dataNonContigSubmitImpl(void *TgtPtr, const void *HstPtr,
+ const NonContigDescTy &CopyInfo,
+ AsyncInfoWrapperTy &AsyncInfoWrapper);
+
/// Retrieve data from the device (device to host transfer).
Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo);
virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+ /// Retrieve non-contiguous data from the device (device to host transfer).
+ Error dataNonContigRetrieve(void *HstPtr, const void *TgtPtr,
+ const NonContigDescTy &CopyInfo,
+ __tgt_async_info *AsyncInfo);
+ virtual Error dataNonContigRetrieveImpl(void *HstPtr, const void *TgtPtr,
+ const NonContigDescTy &CopyInfo,
+ AsyncInfoWrapperTy &AsyncInfoWrapper);
+
/// Instert a data fence between previous data operations and the following
/// operations if necessary for the device
virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0;
@@ -1605,10 +1627,22 @@ struct GenericPluginTy {
int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
int64_t Size, __tgt_async_info *AsyncInfoPtr);
+ /// Copy non-contiguous data to the given device asynchronously.
+ int32_t data_non_contig_submit_async(int32_t DeviceId, void *TgtPtr,
+ void *HstPtr,
+ const NonContigDescTy &CopyInfo,
+ __tgt_async_info *AsyncInfoPtr);
+
/// Copy data from the given device.
int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
int64_t Size);
+ /// Copy non-contiguous data from the given device asynchronously.
+ int32_t data_non_contig_retrieve_async(int32_t DeviceId, void *HstPtr,
+ void *TgtPtr,
+ const NonContigDescTy &CopyInfo,
+ __tgt_async_info *AsyncInfoPtr);
+
/// Copy data from the given device asynchronously.
int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
int64_t Size, __tgt_async_info *AsyncInfoPtr);
@@ -1632,15 +1666,26 @@ struct GenericPluginTy {
KernelExtraArgsTy *KernelExtraArgs,
__tgt_async_info *AsyncInfoPtr);
+ /// Enqueue a host call into the asynchronous queue.
+ int32_t enqueue_host_call(int32_t DeviceId, void (*Callback)(void *),
+ void *UserData, __tgt_async_info *AsyncInfo);
+
/// Synchronize an asyncrhonous queue with the plugin runtime.
int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+ /// Synchronize an asynchronous queue with the plugin runtime without
+ /// releasing it.
+ int32_t synchronize_static(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+
/// Query the current state of an asynchronous queue.
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
/// Obtain information about the given device.
InfoTreeNode obtain_device_info(int32_t DeviceId);
+ /// Query the current state of an asynchronous queue without releasing it.
+ int32_t query_async_static(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4a8bf8d257344..0b5b357b68433 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -953,6 +953,15 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo,
return queryAsyncImpl(*AsyncInfo, ReleaseQueue, IsQueueWorkCompleted);
}
+Expected<QueueStatusTy>
+GenericDeviceTy::queryAsyncStatic(__tgt_async_info *AsyncInfo) {
+ if (!AsyncInfo || !AsyncInfo->Queue)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "invalid async info queue");
+
+ return queryAsyncStaticImpl(*AsyncInfo);
+}
+
Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
return Plugin::error(ErrorCode::UNSUPPORTED,
"device does not support VA Management");
@@ -1102,6 +1111,71 @@ Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
return Err;
}
+Error GenericDeviceTy::dataNonContigSubmit(void *TgtPtr, const void *HstPtr,
+ const NonContigDescTy &CopyInfo,
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto Err =
+ dataNonContigSubmitImpl(TgtPtr, HstPtr, CopyInfo, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
+static void dumpContigCopyInfo(const NonContigDescTy &CopyInfo) {
+ for (unsigned I = 0; I < CopyInfo.getRank(); I++)
+ ODBG(OLDT_Init) << " Dim " << I << " : Offset " << CopyInfo.Dims[I].Offset
+ << " Count " << CopyInfo.Dims[I].Count << " Stride "
+ << CopyInfo.Dims[I].Stride << "\n";
+}
+
+/// Recursively walk \p CopyInfo starting at dimension \p CurrentDim and issue
+/// one CopyFunc call per contiguous chunk. \p Offset accumulates the offset.
+template <auto CopyFunc, typename DstPtrTy, typename SrcPtrTy>
+static Error targetDataNonContiguous(GenericDeviceTy &Device, DstPtrTy DstPtr,
+                                     SrcPtrTy SrcPtr,
+                                     const NonContigDescTy &CopyInfo,
+                                     unsigned CurrentDim, uint64_t Offset,
+                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  ODBG(OLDT_Init) << "Non Contig Copy of Dim " << CurrentDim;
+  if (CurrentDim == CopyInfo.getRank()) {
+    ODBG(OLDT_Init) << "Moving non-contiguous chunk with size "
+                    << CopyInfo.getLastDimCopySize() << ", "
+                    << static_cast<const void *>(SrcPtr + Offset) << " -> "
+                    << static_cast<const void *>(DstPtr + Offset) << ".\n";
+    return (Device.*CopyFunc)(DstPtr + Offset, SrcPtr + Offset,
+                              CopyInfo.getLastDimCopySize(), AsyncInfoWrapper);
+  }
+  // The last dimension is one contiguous piece: only its first element counts.
+  const auto &Dim = CopyInfo.Dims[CurrentDim];
+  const bool IsLastDim = CurrentDim == CopyInfo.getRank() - 1;
+  for (unsigned int I = 0; I < Dim.Count; ++I) {
+    const uint64_t CurOffset = Dim.Offset + I * Dim.Stride;
+    if (Error Ret = targetDataNonContiguous<CopyFunc>(
+            Device, DstPtr, SrcPtr, CopyInfo, CurrentDim + 1,
+            Offset + CurOffset, AsyncInfoWrapper))
+      return Ret;
+    if (IsLastDim)
+      break;
+  }
+  return Error::success();
+}
+
+Error GenericDeviceTy::dataNonContigSubmitImpl(
+ void *TgtPtr, const void *HstPtr, const NonContigDescTy &CopyInfo,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ ODBG(OLDT_Init) << "Non contig descriptor:\n";
+ dumpContigCopyInfo(CopyInfo);
+ NonContigDescTy MergedCopyInfo = CopyInfo;
+ MergedCopyInfo.mergeContiguousDims();
+ ODBG(OLDT_Init) << "Merged non contig descriptor:\n";
+ dumpContigCopyInfo(MergedCopyInfo);
+ return targetDataNonContiguous<&GenericDeviceTy::dataSubmitImpl>(
+ *this, reinterpret_cast<char *>(TgtPtr),
+ reinterpret_cast<const char *>(HstPtr), MergedCopyInfo, /*CurrentDim=*/0,
+ /*Offset=*/0, AsyncInfoWrapper);
+}
+
Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
@@ -1111,6 +1185,32 @@ Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
return Err;
}
+Error GenericDeviceTy::dataNonContigRetrieve(void *HstPtr, const void *TgtPtr,
+ const NonContigDescTy &CopyInfo,
+ __tgt_async_info *AsyncInfo) {
+ AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+
+ auto Err =
+ dataNonContigRetrieveImpl(HstPtr, TgtPtr, CopyInfo, AsyncInfoWrapper);
+ AsyncInfoWrapper.finalize(Err);
+ return Err;
+}
+
+Error GenericDeviceTy::dataNonContigRetrieveImpl(
+ void *HstPtr, const void *TgtPtr, const NonContigDescTy &CopyInfo,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) {
+ ODBG(OLDT_Init) << "Non contig descriptor:\n";
+ dumpContigCopyInfo(CopyInfo);
+ NonContigDescTy MergedCopyInfo = CopyInfo;
+ MergedCopyInfo.mergeContiguousDims();
+ ODBG(OLDT_Init) << "Merged non contig descriptor:\n";
+ dumpContigCopyInfo(MergedCopyInfo);
+ return targetDataNonContiguous<&GenericDeviceTy::dataRetrieveImpl>(
+ *this, reinterpret_cast<char *>(HstPtr),
+ reinterpret_cast<const char *>(TgtPtr), MergedCopyInfo, /*CurrentDim=*/0,
+ /*Offset=*/0, AsyncInfoWrapper);
+}
+
Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo) {
@@ -1628,6 +1728,22 @@ int32_t GenericPluginTy::data_submit_async(int32_t DeviceId, void *TgtPtr,
return OFFLOAD_SUCCESS;
}
+int32_t GenericPluginTy::data_non_contig_submit_async(
+    int32_t DeviceId, void *TgtPtr, void *HstPtr,
+    const NonContigDescTy &CopyInfo, __tgt_async_info *AsyncInfoPtr) {
+  auto Err = getDevice(DeviceId).dataNonContigSubmit(TgtPtr, HstPtr, CopyInfo,
+                                                     AsyncInfoPtr);
+  if (Err) {
+    // A submit moves data from the host to the device; report that direction.
+    REPORT() << "Failure to copy non-contiguous data from host to device. "
+             << "Pointers: host = " << HstPtr << ", device = " << TgtPtr
+             << ": " << toString(std::move(Err));
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
int32_t GenericPluginTy::data_retrieve(int32_t DeviceId, void *HstPtr,
void *TgtPtr, int64_t Size) {
return data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
@@ -1649,6 +1765,22 @@ int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
return OFFLOAD_SUCCESS;
}
+int32_t GenericPluginTy::data_non_contig_retrieve_async(
+    int32_t DeviceId, void *HstPtr, void *TgtPtr,
+    const NonContigDescTy &CopyInfo, __tgt_async_info *AsyncInfoPtr) {
+  auto Err = getDevice(DeviceId).dataNonContigRetrieve(HstPtr, TgtPtr, CopyInfo,
+                                                       AsyncInfoPtr);
+  if (Err) {
+    // Report both pointers so the failing transfer can be identified.
+    REPORT() << "Failure to copy non-contiguous data from device to host. "
+             << "Pointers: host = " << HstPtr << ", device = " << TgtPtr
+             << ": " << toString(std::move(Err));
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
int32_t GenericPluginTy::data_exchange(int32_t SrcDeviceId, void *SrcPtr,
int32_t DstDeviceId, void *DstPtr,
int64_t Size) {
@@ -1691,6 +1823,21 @@ int32_t GenericPluginTy::launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
return OFFLOAD_SUCCESS;
}
+int32_t GenericPluginTy::enqueue_host_call(int32_t DeviceId,
+ void (*Callback)(void *),
+ void *UserData,
+ __tgt_async_info *AsyncInfoPtr) {
+ auto Err =
+ getDevice(DeviceId).enqueueHostCall(Callback, UserData, AsyncInfoPtr);
+ if (Err) {
+ REPORT() << "Failure to enqueue host call in device " << DeviceId << ": "
+ << toString(std::move(Err));
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
int32_t GenericPluginTy::synchronize(int32_t DeviceId,
__tgt_async_info *AsyncInfoPtr) {
auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr);
@@ -1703,6 +1850,18 @@ int32_t GenericPluginTy::synchronize(int32_t DeviceId,
return OFFLOAD_SUCCESS;
}
+int32_t GenericPluginTy::synchronize_static(int32_t DeviceId,
+ __tgt_async_info *AsyncInfoPtr) {
+ auto Err = getDevice(DeviceId).synchronize(AsyncInfoPtr, false);
+ if (Err) {
+ REPORT() << "Failure to synchronize stream " << AsyncInfoPtr->Queue << ": "
+ << toString(std::move(Err));
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
int32_t GenericPluginTy::query_async(int32_t DeviceId,
__tgt_async_info *AsyncInfoPtr) {
auto Err = getDevice(DeviceId).queryAsync(AsyncInfoPtr);
@@ -1725,6 +1884,18 @@ InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
return std::move(*InfoOrErr);
}
+int32_t GenericPluginTy::query_async_static(int32_t DeviceId,
+                                            __tgt_async_info *AsyncInfoPtr) {
+  auto Res = getDevice(DeviceId).queryAsyncStatic(AsyncInfoPtr);
+  if (!Res) {
+    void *Queue = AsyncInfoPtr ? AsyncInfoPtr->Queue : nullptr; // May be null.
+    REPORT() << "Failure to query stream " << Queue << ": "
+             << toString(Res.takeError());
+    return OFFLOAD_FAIL;
+  }
+  return static_cast<int32_t>(*Res);
+}
+
void GenericPluginTy::print_device_info(int32_t DeviceId) {
if (auto Err = getDevice(DeviceId).printInfo())
REPORT() << "Failure to print device " << DeviceId
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 05fdcb032bd29..42e306361535f 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -780,6 +780,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "error in cuStreamQuery: %s");
}
+  Expected<QueueStatusTy>
+  queryAsyncStaticImpl(__tgt_async_info &AsyncInfo) override {
+    CUresult Res = cuStreamQuery(reinterpret_cast<CUstream>(AsyncInfo.Queue));
+    switch (Res) {
+    case CUDA_SUCCESS: // All work on the stream has completed.
+      return QueueStatusTy::READY;
+    case CUDA_ERROR_NOT_READY: // Work is still pending; this is not an error.
+      return QueueStatusTy::NOT_READY;
+    default: // Any other result is reported as a plugin error.
+      return Plugin::check(Res, "error in cuStreamQuery: %s");
+    }
+  }
+
Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
// TODO: Register the buffer as CUDA host memory.
return HstPtr;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 3f2cc612e4df6..d4c7b95f2f5db 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -328,6 +328,13 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ /// All functions are already synchronous. No need to do anything on this
+ /// query function.
+ Expected<QueueStatusTy>
+ queryAsyncStaticImpl(__tgt_async_info &AsyncInfo) override {
+ return QueueStatusTy::READY;
+ }
+
/// This plugin does not support interoperability
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
return Plugin::success();
More information about the llvm-branch-commits
mailing list