[Openmp-commits] [llvm] [openmp] [OMPT] Upstream device tracing implementation (PR #200165)
Jan Patrick Lehr via Openmp-commits
openmp-commits at lists.llvm.org
Tue Jun 2 06:59:51 PDT 2026
https://github.com/jplehr updated https://github.com/llvm/llvm-project/pull/200165
>From 5a0933c2d81cedd67fcef43c84b90c367365ae4a Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 13 Apr 2026 07:54:04 -0500
Subject: [PATCH 01/15] [OpenMP][OMPT] Add ompt_set_frame_enter support to
libomptarget connector
Add ompt_set_frame_enter to libomp's target function lookup table so that
libomptarget can retrieve it during OMPT initialization. This function sets
the OMPT frame-enter address and flags on the encountering host thread's task
frame and transitions the thread state to ompt_state_work_parallel, enabling
profiling tools to observe the host thread context when entering a target
region.
The implementation consists of:
- OMPT_FRAME_SET/CLEAR/SET_P macros in ompt-internal.h for frame field access
- __ompt_set_frame_enter_internal() in ompt-specific.cpp with the actual logic
- A static ompt_set_frame_enter() wrapper in ompt-general.cpp registered via
provide_fn() in ompt_libomp_target_fn_lookup()
Made-with: Cursor
---
openmp/runtime/src/ompt-general.cpp | 5 +++++
openmp/runtime/src/ompt-internal.h | 12 ++++++++++++
openmp/runtime/src/ompt-specific.cpp | 15 +++++++++++++++
openmp/runtime/src/ompt-specific.h | 4 ++--
4 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/openmp/runtime/src/ompt-general.cpp b/openmp/runtime/src/ompt-general.cpp
index 959457d380d03..ab0944592bb92 100644
--- a/openmp/runtime/src/ompt-general.cpp
+++ b/openmp/runtime/src/ompt-general.cpp
@@ -892,6 +892,10 @@ static ompt_data_t *ompt_get_target_task_data() {
return __ompt_get_target_task_data();
}
+/// Entry point handed to libomptarget through ompt_libomp_target_fn_lookup().
+/// Forwards all three arguments to __ompt_set_frame_enter_internal() and
+/// returns its result unchanged.
+static int ompt_set_frame_enter(void *addr, int flags, int state) {
+ return __ompt_set_frame_enter_internal(addr, flags, state);
+}
+
/// Lookup function to query libomp callbacks registered by the tool
static ompt_interface_fn_t ompt_libomp_target_fn_lookup(const char *s) {
#define provide_fn(fn) \
@@ -901,6 +905,7 @@ static ompt_interface_fn_t ompt_libomp_target_fn_lookup(const char *s) {
provide_fn(ompt_get_callback);
provide_fn(ompt_get_task_data);
provide_fn(ompt_get_target_task_data);
+ provide_fn(ompt_set_frame_enter);
#undef provide_fn
#define ompt_interface_fn(fn, type, code) \
diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h
index 36b45f7a91ea2..7579318f36476 100644
--- a/openmp/runtime/src/ompt-internal.h
+++ b/openmp/runtime/src/ompt-internal.h
@@ -13,6 +13,8 @@
#ifndef __OMPT_INTERNAL_H__
#define __OMPT_INTERNAL_H__
+#include "kmp_platform.h"
+
#include "ompt-event-specific.h"
#include "omp-tools.h"
@@ -24,6 +26,16 @@
((x == fork_context_gnu) ? ompt_parallel_invoker_program \
: ompt_parallel_invoker_runtime)
+/// Store \p ptr_value and \p flags into the `<which>_frame.ptr` and
+/// `<which>_frame_flags` fields of the frame pointed to by \p frame
+/// (\p which is a field-name prefix such as `enter`).
+///
+/// Wrapped in do { } while (0) so that the expansion is a single statement
+/// and `if (c) OMPT_FRAME_SET(...); else ...` parses as intended. The
+/// arguments are parenthesized so that expressions such as `&Frame` or
+/// `Base + Off` can be passed safely.
+#define OMPT_FRAME_SET(frame, which, ptr_value, flags)                         \
+  do {                                                                         \
+    (frame)->which##_frame.ptr = (ptr_value);                                  \
+    (frame)->which##_frame_flags = (flags);                                    \
+  } while (0)
+
+/// Reset the `<which>` frame pointer to NULL and its flags to 0.
+#define OMPT_FRAME_CLEAR(frame, which) OMPT_FRAME_SET(frame, which, NULL, 0)
+
+/// Evaluates to true when the `<which>` frame pointer is non-NULL.
+#define OMPT_FRAME_SET_P(frame, which) ((frame)->which##_frame.ptr != NULL)
+
#define ompt_callback(e) e##_callback
typedef struct ompt_callbacks_internal_s {
diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp
index 94ae2e5293875..5e42401552f70 100644
--- a/openmp/runtime/src/ompt-specific.cpp
+++ b/openmp/runtime/src/ompt-specific.cpp
@@ -481,6 +481,21 @@ int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
return 0;
}
+//----------------------------------------------------------
+// target region support
+//----------------------------------------------------------
+
+/// Record \p addr and \p flags as the enter frame of the current task of the
+/// thread identified by __kmp_entry_gtid(), and switch that thread's OMPT
+/// state to ompt_state_work_parallel.
+///
+/// \returns the thread's OMPT state as it was before the switch.
+///
+/// NOTE(review): \p state is never read; the new state is always
+/// ompt_state_work_parallel. Confirm this is intended -- a caller that passes
+/// a previously returned state here will not get that state restored.
+int __ompt_set_frame_enter_internal(void *addr, int flags, int state) {
+ int gtid = __kmp_entry_gtid();
+ kmp_info_t *thr = __kmp_threads[gtid];
+
+ ompt_frame_t *ompt_frame = &OMPT_CUR_TASK_INFO(thr)->frame;
+ OMPT_FRAME_SET(ompt_frame, enter, addr, flags);
+ // Remember the state being replaced so it can be handed back to the caller.
+ int old_state = thr->th.ompt_thread_info.state;
+ thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+ return old_state;
+}
+
//----------------------------------------------------------
// team support
//----------------------------------------------------------
diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h
index b7eb140458b40..d6336780d1f37 100644
--- a/openmp/runtime/src/ompt-specific.h
+++ b/openmp/runtime/src/ompt-specific.h
@@ -20,10 +20,10 @@
* forward declarations
****************************************************************************/
-/// Entrypoint used by libomptarget to register callbacks in libomp, if not
-/// done already
void __ompt_force_initialization();
+int __ompt_set_frame_enter_internal(void *addr, int flags, int state);
+
void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid);
void __ompt_thread_assign_wait_id(void *variable);
>From 90f5f31ad25e43c05163d1ecacaa9d5970d56e50 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 2 Apr 2026 07:27:58 -0500
Subject: [PATCH 02/15] [Offload] Add GenericProfilerTy abstraction and
APITypes extensions
Introduce a GenericProfilerTy base class that decouples the plugin
layer from OMPT. This allows profiling/tracing backends to be
implemented independently of the plugin code.
Key changes:
- Add GenericProfiler.h/cpp with virtual hooks for device lifecycle
events (init, deinit, loadBinary), kernel launch, data alloc/delete,
kernel completion, and data transfer timing
- Add ProfTimerTy RAII timer for scoped alloc/delete measurements
- Add weak getProfilerToAttach() factory (overridable by OMPT)
- Add ExecAsync and ProfilerData fields to __tgt_async_info
- Replace direct OMPT callback invocations in PluginInterface with
profiler hook calls (handleInit, handleDeinit, handleLoadBinary,
handlePreKernelLaunch)
- Add getDeviceTimeStamp() virtual to GenericDeviceTy
- Add Profiler member and getProfiler() to GenericPluginTy
- Suppress ProfilerData during the KernelLaunchEnvironment (KLE) upload to avoid spurious traces
- Add sync path in AsyncInfoWrapperTy::finalize for !ExecAsync
Made-with: Cursor
---
offload/include/Shared/APITypes.h | 11 ++
offload/plugins-nextgen/common/CMakeLists.txt | 1 +
.../common/include/GenericProfiler.h | 182 ++++++++++++++++++
.../common/include/PluginInterface.h | 35 ++--
.../common/src/GenericProfiler.cpp | 35 ++++
.../common/src/PluginInterface.cpp | 78 +++-----
6 files changed, 280 insertions(+), 62 deletions(-)
create mode 100644 offload/plugins-nextgen/common/include/GenericProfiler.h
create mode 100644 offload/plugins-nextgen/common/src/GenericProfiler.cpp
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 948c12a27107e..a06d5bd490943 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -85,6 +85,17 @@ struct __tgt_async_info {
/// ensure it is a valid location while the transfer to the device is
/// happening.
KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+
+ /// Whether this operation should execute asynchronously. When false, the
+ /// runtime will synchronize the queue after the operation even if a queue
+ /// is present. This allows profiling/tracing to use queue machinery while
+ /// still enforcing synchronous completion boundaries.
+ bool ExecAsync = true;
+
+ /// Opaque handle for profiler-specific data (e.g., OMPT trace record info).
+ /// Owned by the profiler; the runtime threads this pointer through the plugin
+ /// layer to associate async operations with trace records.
+ void *ProfilerData = nullptr;
};
/// This struct contains all of the arguments to a target kernel region launch.
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index aad8d209e931a..ad018da333400 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -11,6 +11,7 @@ add_public_tablegen_target(PluginErrcodes)
# don't want to export `PluginInterface` while `add_llvm_library` requires that.
add_library(PluginCommon OBJECT
src/PluginInterface.cpp
+ src/GenericProfiler.cpp
src/GlobalHandler.cpp
src/JIT.cpp
src/RecordReplay.cpp
diff --git a/offload/plugins-nextgen/common/include/GenericProfiler.h b/offload/plugins-nextgen/common/include/GenericProfiler.h
new file mode 100644
index 0000000000000..ea3c3766554c8
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/GenericProfiler.h
@@ -0,0 +1,182 @@
+//===- GenericProfiler.h - GenericProfiler interface for use in Plugins ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The GenericProfiler interface allows implementing profiler logic for various
+// backends, such as OMPT or other tracing mechanisms.
+// This enables the plugins to be agnostic of the actual high-level language
+// that is implemented.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_PLUGINS_NEXTGEN_COMMON_INCLUDE_GENERICPROFILER_H
+#define OFFLOAD_PLUGINS_NEXTGEN_COMMON_INCLUDE_GENERICPROFILER_H
+
+#include "Shared/APITypes.h"
+
+#include <cstdint>
+#include <functional>
+#include <tuple>
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+struct GenericDeviceTy;
+struct GenericPluginTy;
+class GenericProfilerTy;
+
+/// Call \p F as F(P, StartNanos, EndNanos, <tuple elements...>), where the
+/// tuple elements of \p Args are picked by the indices carried in the
+/// (unnamed) index_sequence argument.
+template <typename CallableT, typename... ElemTs, size_t... Indices>
+void callViaIndexSeq(CallableT F, GenericProfilerTy *P, uint64_t StartNanos,
+                     uint64_t EndNanos, std::tuple<ElemTs...> Args,
+                     std::index_sequence<Indices...>) {
+  F(P, StartNanos, EndNanos, std::get<Indices>(Args)...);
+}
+
+/// Expand \p Tup and call \p F as F(P, StartNanos, EndNanos, <elements...>).
+template <typename CallableT, typename... ElemTs>
+void callViaUnpack(CallableT F, GenericProfilerTy *P, uint64_t StartNanos,
+                   uint64_t EndNanos, std::tuple<ElemTs...> Tup) {
+  // One index per tuple element: 0, 1, ..., sizeof...(ElemTs) - 1.
+  using IndicesT = std::index_sequence_for<ElemTs...>;
+  callViaIndexSeq(F, P, StartNanos, EndNanos, Tup, IndicesT{});
+}
+
+/// Abstraction layer to implement different profiler backends.
+///
+/// The plugins call into the GenericProfilerTy to handle the specific events
+/// with whatever specific backend was instantiated. For now, the supported
+/// backends are limited to an OMPT implementation. Every hook in this base
+/// class has an empty body, so an instance of the base class records nothing.
+class GenericProfilerTy {
+public:
+  GenericProfilerTy() = default;
+  virtual ~GenericProfilerTy() = default;
+
+  /// Obtain a pointer to profiler-specific data, if any. The base
+  /// implementation has none and returns nullptr.
+  virtual void *getProfilerSpecificData() { return nullptr; }
+
+  /// Whether profiling is active. The base implementation reports false.
+  virtual bool isProfilingEnabled() { return false; }
+
+  /// Set the factors which are used to interpolate the device clock compared to
+  /// the host clock. This follows a simple linear interpolation: Slope * <time>
+  /// + Offset. The values are cached in this object and then passed on to
+  /// setTimeConversionFactorsImpl().
+  void setTimeConversionFactors(double Slope, double Offset) {
+    HostToDeviceSlope = Slope;
+    HostToDeviceOffset = Offset;
+    setTimeConversionFactorsImpl(HostToDeviceSlope, HostToDeviceOffset);
+  }
+
+  /// Hook that is called when the plugin is initialized.
+  virtual void handleInit(GenericDeviceTy *Device, GenericPluginTy *Plugin) {}
+
+  /// Hook that is called when the plugin is de-initialized.
+  virtual void handleDeinit(GenericDeviceTy *Device, GenericPluginTy *Plugin) {}
+
+  /// Hook that is called when the device image is loaded.
+  virtual void handleLoadBinary(GenericDeviceTy *Device,
+                                GenericPluginTy *Plugin,
+                                const StringRef InputTgtImage) {}
+
+  /// Hook that is called when memory is allocated on the device.
+  virtual void handleDataAlloc(uint64_t StartNanos, uint64_t EndNanos,
+                               void *HostPtr, uint64_t Size, void *Data) {}
+
+  /// Hook that is called when memory is freed on the device.
+  virtual void handleDataDelete(uint64_t StartNanos, uint64_t EndNanos,
+                                void *TgtPtr, void *Data) {}
+
+  /// Hook that is called before launching a kernel.
+  virtual void handlePreKernelLaunch(GenericDeviceTy *Device,
+                                     uint32_t NumBlocks[3],
+                                     __tgt_async_info *AI) {}
+
+  /// Hook that is called when the kernel is finished to extract the specific
+  /// timing info for that kernel execution.
+  virtual void handleKernelCompletion(uint64_t StartNanos, uint64_t EndNanos,
+                                      void *Data) {}
+
+  /// Hook that is called when a data transfer happens to extract timing info
+  /// for that transfer.
+  virtual void handleDataTransfer(uint64_t StartNanos, uint64_t EndNanos,
+                                  void *Data) {}
+
+  /// Allow factors for time conversion between host and device.
+  virtual void setTimeConversionFactorsImpl(double Slope, double Offset) {}
+
+  /// RAII style timer that measures the elapsed time between construction and
+  /// destruction, then invokes a callback with the profiler, start/end times,
+  /// and any captured arguments. Both timestamps are read through
+  /// getDeviceTimeStamp() for the device passed to the constructor.
+  template <typename FnT, typename... ArgsT> class ProfTimerTy {
+  public:
+    ProfTimerTy(FnT &&F, GenericProfilerTy *P, GenericDeviceTy *D, ArgsT... As)
+        : Fun(F), Prof(P), Dev(D), Args(As...) {
+      assert(Prof && "GenericProfilerTy is null");
+      assert(Dev && "GenericDeviceTy is null");
+      if (Prof)
+        StartTime = Prof->getDeviceTimeStamp(Dev);
+    }
+
+    // The destructor reports the measured interval, so a copy would report
+    // the same interval a second time. The factory functions below return the
+    // timer as a prvalue, which needs neither a copy nor a move constructor.
+    ProfTimerTy(const ProfTimerTy &) = delete;
+    ProfTimerTy &operator=(const ProfTimerTy &) = delete;
+
+    ~ProfTimerTy() {
+      assert(Prof && "GenericProfilerTy is null");
+      assert(Dev && "GenericDeviceTy is null");
+      if (Prof) {
+        uint64_t EndTime = Prof->getDeviceTimeStamp(Dev);
+        callViaUnpack(Fun, Prof, StartTime, EndTime, Args);
+      }
+    }
+
+  private:
+    FnT Fun;
+    GenericProfilerTy *Prof;
+    GenericDeviceTy *Dev;
+    uint64_t StartTime = 0;
+    std::tuple<ArgsT...> Args;
+  };
+
+  // Mirrors the constructor's parameter list: the GenericDeviceTy pointer is a
+  // fixed parameter and only the trailing arguments are deduced as ArgsT.
+  template <typename FnT, typename... ArgsT>
+  ProfTimerTy(FnT &&, GenericProfilerTy *, GenericDeviceTy *, ArgsT...)
+      -> ProfTimerTy<FnT, ArgsT...>;
+
+  template <typename FnT, typename... ArgsT> friend class ProfTimerTy;
+
+  /// Returns an RAII style timer, which will handle data allocation timing.
+  /// When the timer is destroyed it calls handleDataAlloc() with the measured
+  /// start/end times followed by \p HostPtr, \p Size and \p ProfData.
+  [[nodiscard]] auto getScopedDataAllocTimer(GenericDeviceTy *Dev,
+                                             void *HostPtr, uint64_t Size,
+                                             void *ProfData = nullptr) {
+    return ProfTimerTy(
+        [](GenericProfilerTy *P, auto... args) {
+          assert(P && "P was null");
+          P->handleDataAlloc(args...);
+        },
+        this, Dev, HostPtr, Size, ProfData);
+  }
+
+  /// Returns an RAII style timer, which will handle data deletion timing.
+  /// When the timer is destroyed it calls handleDataDelete() with the measured
+  /// start/end times followed by \p TgtPtr and \p ProfData.
+  [[nodiscard]] auto getScopedDataDeleteTimer(GenericDeviceTy *Dev,
+                                              void *TgtPtr,
+                                              void *ProfData = nullptr) {
+    return ProfTimerTy(
+        [](GenericProfilerTy *P, auto... args) {
+          assert(P && "P was null");
+          P->handleDataDelete(args...);
+        },
+        this, Dev, TgtPtr, ProfData);
+  }
+
+protected:
+  /// Last values passed to setTimeConversionFactors().
+  double HostToDeviceSlope = 1.0;
+  double HostToDeviceOffset = .0;
+
+private:
+  /// Vendor-specific implementation to obtain device time.
+  uint64_t getDeviceTimeStamp(GenericDeviceTy *D);
+};
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OFFLOAD_PLUGINS_NEXTGEN_COMMON_INCLUDE_GENERICPROFILER_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index f99a0e817fd58..5e29ab576a999 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -29,6 +29,7 @@
#include "Shared/Requirements.h"
#include "Shared/Utils.h"
+#include "GenericProfiler.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "MemoryManager.h"
@@ -57,6 +58,12 @@
using namespace llvm::offload::debug;
+/// Factory function for creating a profiler. The default (weak) implementation
+/// returns a no-op GenericProfilerTy. When OMPT is enabled, a strong override
+/// returns an OmptProfilerTy instance.
+std::unique_ptr<llvm::omp::target::plugin::GenericProfilerTy>
+getProfilerToAttach();
+
namespace llvm {
namespace omp {
namespace target {
@@ -1120,6 +1127,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
uint32_t getDebugKind() const { return OMPX_DebugKind; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
+ /// Get a device-specific timestamp in nanoseconds, used by the profiler
+ /// for timing device operations. Subclasses should override this to provide
+ /// hardware-accurate timestamps (e.g., via HSA system info).
+ virtual uint64_t getDeviceTimeStamp() { return 0; }
+
/// Get target compute unit kind (e.g., sm_80, or gfx908).
virtual std::string getComputeUnitKind() const { return "unknown"; }
@@ -1376,16 +1388,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// This is used to run the RPC server during task synchronization.
RPCServerTy *RPCServer;
-#ifdef OMPT_SUPPORT
- /// OMPT callback functions
-#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
- FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
-#undef defineOmptCallback
-
- /// Internal representation for OMPT device (initialize & finalize)
- std::atomic<bool> OmptInitialized;
-#endif
-
/// The total per-block native shared memory that a kernel may use.
size_t MaxBlockSharedMemSize = 0;
};
@@ -1397,7 +1399,8 @@ struct GenericPluginTy {
/// Construct a plugin instance.
GenericPluginTy(Triple::ArchType TA)
- : GlobalHandler(nullptr), JIT(TA), RPCServer(nullptr) {}
+ : GlobalHandler(nullptr), JIT(TA), RPCServer(nullptr),
+ Profiler(getProfilerToAttach()) {}
virtual ~GenericPluginTy() {}
@@ -1488,7 +1491,12 @@ struct GenericPluginTy {
/// Tear down any target-specific doorbell resources.
virtual Error deinitRPCDoorbell() { return Plugin::success(); }
- /// Get a reference to the record and replay interface for the plugin.
+ /// Get a pointer to the profiler attached to this plugin.
+ GenericProfilerTy *getProfiler() {
+ assert(Profiler && "Profiler not initialized");
+ return Profiler.get();
+ }
+
/// Initialize a device within the plugin.
Error initDevice(int32_t DeviceId);
@@ -1760,6 +1768,9 @@ struct GenericPluginTy {
/// The interface between the plugin and the GPU for host services.
RPCServerTy *RPCServer;
+
+ /// The profiler backend attached to this plugin (e.g., OMPT).
+ std::unique_ptr<GenericProfilerTy> Profiler;
};
/// Auxiliary interface class for GenericDeviceResourceManagerTy. This class
diff --git a/offload/plugins-nextgen/common/src/GenericProfiler.cpp b/offload/plugins-nextgen/common/src/GenericProfiler.cpp
new file mode 100644
index 0000000000000..7269a8aaf8991
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/GenericProfiler.cpp
@@ -0,0 +1,35 @@
+//===- GenericProfiler.cpp - GenericProfiler implementation ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "GenericProfiler.h"
+#include "PluginInterface.h"
+
+#include <cstdint>
+#include <memory>
+
+/// Default profiler factory. It is declared weak so that another definition of
+/// getProfilerToAttach() can take its place at link time; this one hands out a
+/// plain GenericProfilerTy.
+__attribute__((weak))
+std::unique_ptr<llvm::omp::target::plugin::GenericProfilerTy>
+getProfilerToAttach() {
+  using ProfilerTy = llvm::omp::target::plugin::GenericProfilerTy;
+  return std::make_unique<ProfilerTy>();
+}
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Ask \p D for its current timestamp; a null device yields 0.
+uint64_t GenericProfilerTy::getDeviceTimeStamp(GenericDeviceTy *D) {
+  if (!D)
+    return 0;
+  return D->getDeviceTimeStamp();
+}
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 1e05c6ae66fdf..1186fd3562be0 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -21,10 +21,7 @@
#include "Utils/ELF.h"
#include "omptarget.h"
-#ifdef OMPT_SUPPORT
-#include "OpenMP/OMPT/Callback.h"
-#include "omp-tools.h"
-#endif
+#include "GenericProfiler.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
@@ -47,7 +44,9 @@ using namespace llvm::offload::debug;
AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
__tgt_async_info *AsyncInfoPtr)
: Device(Device),
- AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
+ AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {
+ this->AsyncInfoPtr->ProfilerData = nullptr;
+}
Error AsyncInfoWrapperTy::synchronize() {
assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");
@@ -66,6 +65,12 @@ void AsyncInfoWrapperTy::finalize(Error &Err) {
if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
Err = Device.synchronize(&LocalAsyncInfo);
+ // When ExecAsync is false (e.g., for profiling/tracing), synchronize the
+ // queue even if it's not the local async info, so completion callbacks run
+ // before we return to the caller.
+ if (AsyncInfoPtr && !AsyncInfoPtr->ExecAsync && AsyncInfoPtr->Queue && !Err)
+ Err = Device.synchronize(AsyncInfoPtr);
+
// Invalidate the wrapper object.
AsyncInfoPtr = nullptr;
}
@@ -162,9 +167,15 @@ GenericKernelTy::getKernelLaunchEnvironment(
DPxPTR(&LocalKLE), DPxPTR(*AllocOrErr),
sizeof(KernelLaunchEnvironmentTy));
+ // Temporarily suppress ProfilerData so the KLE upload is not traced as
+ // a user data operation.
+ __tgt_async_info *AI = AsyncInfoWrapper;
+ void *SavedProfilerData = AI->ProfilerData;
+ AI->ProfilerData = nullptr;
auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
sizeof(KernelLaunchEnvironmentTy),
AsyncInfoWrapper);
+ AI->ProfilerData = SavedProfilerData;
if (Err)
return Err;
return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
@@ -323,6 +334,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
RRHandle = *RRHandleOrErr;
}
+ GenericDevice.Plugin.getProfiler()->handlePreKernelLaunch(
+ &GenericDevice, EffectiveNumBlocks, AsyncInfoWrapper);
+
if (auto Err = launchImpl(GenericDevice, EffectiveNumThreads,
EffectiveNumBlocks, DynBlockMemConf.NativeSize,
KernelArgs, LaunchParams, AsyncInfoWrapper))
@@ -498,22 +512,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
// vendor (u)uid will become available later.
setDeviceUidFromVendorUid(std::to_string(static_cast<uint64_t>(DeviceId)));
-#ifdef OMPT_SUPPORT
- OmptInitialized.store(false);
- // Bind the callbacks to this device's member functions
-#define bindOmptCallback(Name, Type, Code) \
- if (ompt::Initialized && ompt::lookupCallbackByCode) { \
- ompt::lookupCallbackByCode((ompt_callbacks_t)(Code), \
- ((ompt_callback_t *)&(Name##_fn))); \
- ODBG(OLDT_Tool) << "OMPT: class bound " << #Name << "=" \
- << ((void *)(uint64_t)Name##_fn); \
- }
-
- FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
-#undef bindOmptCallback
-
-#endif
-
// Envar that indicates whether mapped host buffers should be locked
// automatically. The possible values are boolean (on/off) and a special:
// off: Mapped host buffers are not locked.
@@ -545,17 +543,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
if (auto Err = initImpl(Plugin))
return Err;
-#ifdef OMPT_SUPPORT
- if (ompt::Initialized) {
- bool ExpectedStatus = false;
- if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
- performOmptCallback(device_initialize, Plugin.getUserId(DeviceId),
- /*type=*/getComputeUnitKind().c_str(),
- /*device=*/reinterpret_cast<ompt_device_t *>(this),
- /*lookup=*/ompt::lookupCallbackByName,
- /*documentation=*/nullptr);
- }
-#endif
+ Plugin.getProfiler()->handleInit(this, &Plugin);
// Read and reinitialize the envars that depend on the device initialization.
// Notice these two envars may change the stack size and heap size of the
@@ -645,13 +633,7 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
if (auto Err = RPCServer->deinitDevice(*this))
return Err;
-#ifdef OMPT_SUPPORT
- if (ompt::Initialized) {
- bool ExpectedStatus = true;
- if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
- performOmptCallback(device_finalize, Plugin.getUserId(DeviceId));
- }
-#endif
+ Plugin.getProfiler()->handleDeinit(this, &Plugin);
return deinitImpl();
}
@@ -686,17 +668,7 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
if (auto Err = setupRPCServer(Plugin, *Image))
return std::move(Err);
-#ifdef OMPT_SUPPORT
- if (ompt::Initialized) {
- size_t Bytes = InputTgtImage.size();
- performOmptCallback(
- device_load, Plugin.getUserId(DeviceId),
- /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr,
- /*ImgSize=*/Bytes,
- /*HostAddr=*/const_cast<unsigned char *>(InputTgtImage.bytes_begin()),
- /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
- }
-#endif
+ Plugin.getProfiler()->handleLoadBinary(this, &Plugin, InputTgtImage);
// Call any global constructors present on the device.
if (auto Err = callGlobalConstructors(Plugin, *Image))
@@ -978,6 +950,9 @@ Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
TargetAllocTy Kind) {
+ auto ProfTimer =
+ Plugin.getProfiler()->getScopedDataAllocTimer(this, HostPtr, Size);
+
void *Alloc = nullptr;
if (RecordReplay && RecordReplay->isRecordingOrReplaying())
@@ -1044,6 +1019,9 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
}
Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
+ auto ProfTimer =
+ Plugin.getProfiler()->getScopedDataDeleteTimer(this, TgtPtr);
+
// Free is a noop when recording or replaying.
if (RecordReplay && RecordReplay->isRecordingOrReplaying())
return RecordReplay->deallocate(TgtPtr);
>From fbdd37d7ddcf13c4baec0f9429f8602cf22c1557 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 2 Apr 2026 07:36:31 -0500
Subject: [PATCH 03/15] [Offload][AMDGPU] Wire HSA profiling into
GenericProfiler abstraction
Add device profiling infrastructure to the AMDGPU plugin so that the
GenericProfiler can receive nanosecond-accurate kernel execution and
data transfer timestamps from the HSA runtime.
Key changes:
- Add ProfilingInfoTy struct to transport HSA profiling data
- Add timeKernelInNsAsync/timeDataTransferInNsAsync callbacks that
extract dispatch/copy times from HSA signals and call
handleKernelCompletion/handleDataTransfer on the profiler
- Add getOrNullProfilerSpecificData helper to extract ProfilerData
from AsyncInfoWrapperTy
- Add getDeviceTimeStamp() override using hsa_system_get_info
- Add getSystemTimestampInNs() for HSA system timestamp queries
- Add schedProfilerKernelTiming/schedProfilerDataTransferTiming to
StreamSlotTy for scheduling profiler callbacks on stream slots
- Thread ProfilerSpecificData through pushKernelLaunch,
pushMemoryCopyH2DAsync, pushMemoryCopyD2HAsync, pushMemoryCopyD2DAsync
- Extract ProfilerSpecificData in dataSubmitImpl, dataRetrieveImpl,
dataExchangeImpl, and launchImpl
- Add stub getDeviceTimeStamp() to CUDA plugin
Made-with: Cursor
---
.../amdgpu/dynamic_hsa/hsa.cpp | 1 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 9 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 192 +++++++++++++++++-
offload/plugins-nextgen/cuda/src/rtl.cpp | 3 +
4 files changed, 197 insertions(+), 8 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index 5c7ec186b0ceb..99202574bfecc 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -71,6 +71,7 @@ DLWRAP(hsa_amd_signal_create, 5)
DLWRAP(hsa_amd_signal_async_handler, 5)
DLWRAP(hsa_amd_pointer_info, 5)
DLWRAP(hsa_amd_profiling_get_dispatch_time, 3)
+DLWRAP(hsa_amd_profiling_get_async_copy_time, 2)
DLWRAP(hsa_amd_profiling_set_profiler_enabled, 2)
DLWRAP(hsa_code_object_reader_create_from_memory, 3)
DLWRAP(hsa_code_object_reader_destroy, 1)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index d26f9248e27ef..35b3fc1a594cc 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -206,6 +206,15 @@ hsa_amd_profiling_get_dispatch_time(hsa_agent_t agent, hsa_signal_t signal,
hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t *queue,
int enable);
+/// Output record of hsa_amd_profiling_get_async_copy_time().
+/// NOTE(review): the units of the two fields are not visible here; presumably
+/// raw HSA timestamp ticks -- confirm against the HSA AMD extension docs.
+typedef struct hsa_amd_profiling_async_copy_time_s {
+ uint64_t start;
+ uint64_t end;
+} hsa_amd_profiling_async_copy_time_t;
+
+/// Local declaration of the HSA entry point that dynamic_hsa/hsa.cpp wraps
+/// with DLWRAP(hsa_amd_profiling_get_async_copy_time, 2).
+hsa_status_t
+hsa_amd_profiling_get_async_copy_time(hsa_signal_t signal,
+ hsa_amd_profiling_async_copy_time_t *time);
+
hsa_status_t hsa_amd_vmem_address_reserve(void **va, size_t size,
uint64_t address, uint64_t flags);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index d1c6d0de11280..460afa94f14c1 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -30,6 +30,7 @@
#include "Shared/Utils.h"
#include "Utils/ELF.h"
+#include "GenericProfiler.h"
#include "GlobalHandler.h"
#include "OffloadAPI.h"
#include "OpenMP/OMPT/Callback.h"
@@ -95,6 +96,76 @@ struct AMDGPUEventManagerTy;
struct AMDGPUDeviceImageTy;
struct AMDGPUMemoryManagerTy;
struct AMDGPUMemoryPoolTy;
+struct AMDGPUSignalTy;
+
+/// Used to transport information to profiler timing functions.
+struct ProfilingInfoTy {
+ /// Plugin whose profiler receives the timing result.
+ GenericPluginTy *Plugin;
+ hsa_agent_t Agent;
+ AMDGPUSignalTy *Signal;
+ /// NOTE(review): presumably the factor that converts HSA ticks to
+ /// nanoseconds (see setTicksToTime()) -- confirm at the use site.
+ double TicksToTime;
+ /// Opaque pointer handed back to the profiler hooks unchanged.
+ void *ProfilerSpecificData;
+};
+
+static ProfilingInfoTy *getProfilingInfo(void *Data);
+
+static std::pair<uint64_t, uint64_t>
+getKernelStartAndEndTime(const ProfilingInfoTy *Args);
+
+static std::pair<uint64_t, uint64_t>
+getCopyStartAndEndTime(const ProfilingInfoTy *Args);
+
+static Error timeKernelInNsAsync(void *Data);
+
+/// Stream callback for data transfers: decode \p Data into a ProfilingInfoTy,
+/// obtain the copy's start/end times and pass them, together with the
+/// profiler-specific data, to the plugin profiler's handleDataTransfer().
+static Error timeDataTransferInNsAsync(void *Data) {
+  const ProfilingInfoTy *Info = getProfilingInfo(Data);
+  const std::pair<uint64_t, uint64_t> Times = getCopyStartAndEndTime(Info);
+
+  GenericProfilerTy *Profiler = Info->Plugin->getProfiler();
+  Profiler->handleDataTransfer(Times.first, Times.second,
+                               Info->ProfilerSpecificData);
+  return Plugin::success();
+}
+
+/// Return the ProfilerData pointer carried by the async info behind
+/// \p AsyncInfoWrapper, or nullptr when the wrapper converts to a null
+/// __tgt_async_info pointer.
+static void *
+getOrNullProfilerSpecificData(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
+  if (!AsyncInfo)
+    return nullptr;
+  return AsyncInfo->ProfilerData;
+}
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+/// Compute the factor that converts HSA timestamp ticks into nanoseconds.
+///
+/// Queries HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY and returns 1e9 / frequency.
+/// \returns 1.0 when the query fails or reports a frequency of zero, so the
+/// result is always a finite factor.
+static double setTicksToTime() {
+  uint64_t TicksFrequency = 1;
+
+  hsa_status_t Status =
+      hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &TicksFrequency);
+  // A zero frequency would turn the division below into +inf and poison every
+  // timestamp scaled with the result.
+  if (Status != HSA_STATUS_SUCCESS || TicksFrequency == 0)
+    return 1.0;
+
+  return 1e9 / static_cast<double>(TicksFrequency);
+}
+
+/// Cached ticks-to-nanoseconds factor; stays 1.0 until
+/// setHSATicksToTimeConstant() is called.
+static double TicksToTime = 1.0;
+
+/// Refresh TicksToTime with the value computed by setTicksToTime().
+static void setHSATicksToTimeConstant() { TicksToTime = setTicksToTime(); }
+
+/// Get the current HSA-based system timestamp in nanoseconds.
+/// Returns 0 when the HSA query fails.
+/// NOTE(review): the HSA_SYSTEM_INFO_TIMESTAMP value is returned as-is,
+/// without applying TicksToTime; that is only nanoseconds if the timestamp
+/// frequency is 1 GHz -- confirm.
+static uint64_t getSystemTimestampInNs() {
+ uint64_t TimeStamp = 0;
+ hsa_status_t Status =
+ hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &TimeStamp);
+ if (Status != HSA_STATUS_SUCCESS)
+ return 0;
+ return TimeStamp;
+}
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
namespace hsa_utils {
@@ -1024,6 +1095,7 @@ struct AMDGPUStreamTy {
MemcpyArgsTy MemcpyArgs;
ReleaseBufferArgsTy ReleaseBufferArgs;
ReleaseSignalArgsTy ReleaseSignalArgs;
+ ProfilingInfoTy ProfilerArgs;
void *CallbackArgs;
};
@@ -1061,7 +1133,32 @@ struct AMDGPUStreamTy {
return Plugin::success();
}
- /// Register a callback to be called on compleition
+ /// Queue timeKernelInNsAsync on this slot together with the ProfilingInfoTy
+ /// it needs (plugin, agent, output signal, tick factor, profiler data).
+ Error schedProfilerKernelTiming(GenericDeviceTy *Device, hsa_agent_t Agent,
+                                 AMDGPUSignalTy *OutputSignal,
+                                 double TicksToTime,
+                                 void *ProfilerSpecificData) {
+   ProfilingInfoTy Info{&(Device->Plugin), Agent, OutputSignal, TicksToTime,
+                        ProfilerSpecificData};
+   // Callback and argument are appended in lock-step so that both lists keep
+   // the same length.
+   Callbacks.emplace_back(timeKernelInNsAsync);
+   ActionArgs.emplace_back().ProfilerArgs = Info;
+   return Plugin::success();
+ }
+
+ /// Queue timeDataTransferInNsAsync on this slot together with the
+ /// ProfilingInfoTy it needs (plugin, agent, output signal, tick factor,
+ /// profiler data).
+ Error schedProfilerDataTransferTiming(GenericDeviceTy *Device,
+                                       hsa_agent_t Agent,
+                                       AMDGPUSignalTy *OutputSignal,
+                                       double TicksToTime,
+                                       void *ProfilerSpecificData) {
+   ProfilingInfoTy Info{&(Device->Plugin), Agent, OutputSignal, TicksToTime,
+                        ProfilerSpecificData};
+   // Callback and argument are appended in lock-step so that both lists keep
+   // the same length.
+   Callbacks.emplace_back(timeDataTransferInNsAsync);
+   ActionArgs.emplace_back().ProfilerArgs = Info;
+   return Plugin::success();
+ }
+
+ /// Register a callback to be called on completion.
Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) {
Callbacks.emplace_back(Func);
ActionArgs.emplace_back().CallbackArgs = Data;
@@ -1086,6 +1183,12 @@ struct AMDGPUStreamTy {
} else if (Callback == releaseSignalAction) {
if (auto Err = releaseSignalAction(&ActionArg))
return Err;
+ } else if (Callback == timeKernelInNsAsync) {
+ if (auto Err = timeKernelInNsAsync(&ActionArg))
+ return Err;
+ } else if (Callback == timeDataTransferInNsAsync) {
+ if (auto Err = timeDataTransferInNsAsync(&ActionArg))
+ return Err;
} else if (Callback) {
if (auto Err = Callback(ActionArg.CallbackArgs))
return Err;
@@ -1357,7 +1460,8 @@ struct AMDGPUStreamTy {
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t GroupSize, uint64_t StackSize,
- AMDGPUMemoryManagerTy &MemoryManager) {
+ AMDGPUMemoryManagerTy &MemoryManager,
+ void *ProfilerSpecificData = nullptr) {
if (Queue == nullptr)
return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
"target queue was nullptr");
@@ -1378,6 +1482,14 @@ struct AMDGPUStreamTy {
if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
return Err;
+#ifdef OMPT_SUPPORT
+ if (ProfilerSpecificData) {
+ if (auto Err = Slots[Curr].schedProfilerKernelTiming(
+ &Device, Agent, OutputSignal, TicksToTime, ProfilerSpecificData))
+ return Err;
+ }
+#endif
+
// If we are running an RPC server we want to wake up the server thread
// whenever there is a kernel running and let it sleep otherwise.
if (Device.getRPCServer())
@@ -1440,7 +1552,8 @@ struct AMDGPUStreamTy {
/// manager once the operation completes.
Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
- AMDGPUMemoryManagerTy &MemoryManager) {
+ AMDGPUMemoryManagerTy &MemoryManager,
+ void *ProfilerSpecificData = nullptr) {
// Retrieve available signals for the operation's outputs.
AMDGPUSignalTy *OutputSignals[2] = {};
if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1503,6 +1616,7 @@ struct AMDGPUStreamTy {
Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
uint64_t CopySize,
AMDGPUMemoryManagerTy &MemoryManager,
+ void *ProfilerSpecificData = nullptr,
size_t NumTimes = 1) {
// Retrieve available signals for the operation's outputs.
AMDGPUSignalTy *OutputSignals[2] = {};
@@ -1578,7 +1692,8 @@ struct AMDGPUStreamTy {
// AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
Error pushMemoryCopyD2DAsync(void *Dst, hsa_agent_t DstAgent, const void *Src,
- hsa_agent_t SrcAgent, uint64_t CopySize) {
+ hsa_agent_t SrcAgent, uint64_t CopySize,
+ void *ProfilerSpecificData = nullptr) {
AMDGPUSignalTy *OutputSignal;
if (auto Err = SignalManager.getResources(/*Num=*/1, &OutputSignal))
return Err;
@@ -2244,6 +2359,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = initMemoryPools())
return Err;
+ setHSATicksToTimeConstant();
+
char GPUName[64];
if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName))
return Err;
@@ -2576,6 +2693,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Returns the clock frequency for the given AMDGPU device.
uint64_t getClockFrequency() const override { return ClockFrequency; }
+ /// Returns the current HSA system timestamp for profiling.
+ uint64_t getDeviceTimeStamp() override { return getSystemTimestampInNs(); }
+
/// Returns the HSA system timestamp frequency. Zero means unavailable.
uint64_t getSystemTimestampFrequency() const {
return SystemTimestampFrequency;
@@ -2784,6 +2904,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AMDGPUStreamTy *Stream = nullptr;
void *PinnedPtr = nullptr;
+ auto ProfilerSpecificData =
+ getOrNullProfilerSpecificData(AsyncInfoWrapper);
+
// Use one-step asynchronous operation when host memory is already pinned.
if (void *PinnedPtr =
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
@@ -2834,7 +2957,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Err;
return Stream->pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size,
- PinnedMemoryManager);
+ PinnedMemoryManager,
+ ProfilerSpecificData);
}
/// Retrieve data from the device (device to host transfer).
@@ -2843,6 +2967,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AMDGPUStreamTy *Stream = nullptr;
void *PinnedPtr = nullptr;
+ auto ProfilerSpecificData =
+ getOrNullProfilerSpecificData(AsyncInfoWrapper);
+
// Use one-step asynchronous operation when host memory is already pinned.
if (void *PinnedPtr =
PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
@@ -2894,7 +3021,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Err;
return Stream->pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size,
- PinnedMemoryManager);
+ PinnedMemoryManager,
+ ProfilerSpecificData);
}
/// Exchange data between two devices within the plugin.
@@ -2903,6 +3031,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) override {
AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice);
+ auto ProfilerSpecificData =
+ getOrNullProfilerSpecificData(AsyncInfoWrapper);
+
// For large transfers use synchronous behavior.
if (Size >= OMPX_MaxAsyncCopyBytes) {
if (AsyncInfoWrapper.hasQueue())
@@ -2931,7 +3062,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
return Stream->pushMemoryCopyD2DAsync(DstPtr, DstDevice.getAgent(), SrcPtr,
- getAgent(), (uint64_t)Size);
+ getAgent(), (uint64_t)Size,
+ ProfilerSpecificData);
}
/// Insert a data fence between previous data operations and the following
@@ -3009,6 +3141,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
PatternSize, PinnedMemoryManager,
+ /*ProfilerSpecificData=*/nullptr,
Size / PatternSize);
}
@@ -4271,10 +4404,12 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// HSA requires the group segment size to include both static and dynamic.
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
+ auto ProfilerSpecificData = getOrNullProfilerSpecificData(AsyncInfoWrapper);
+
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
TotalBlockMemSize, StackSize,
- ArgsMemoryManager);
+ ArgsMemoryManager, ProfilerSpecificData);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
@@ -4441,6 +4576,47 @@ void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
}
+/// Implementation of profiling helper functions.
+
+/// View the opaque callback argument as the ProfilingInfoTy it points to.
+static ProfilingInfoTy *getProfilingInfo(void *Data) {
+  return static_cast<ProfilingInfoTy *>(Data);
+}
+
+/// Query the HSA dispatch profiling times for Args->Signal on Args->Agent and
+/// scale start and end by Args->TicksToTime.
+/// Returns {0, 0} when the HSA query does not succeed.
+static std::pair<uint64_t, uint64_t>
+getKernelStartAndEndTime(const ProfilingInfoTy *Args) {
+  hsa_amd_profiling_dispatch_time_t DispatchTime = {};
+  if (hsa_amd_profiling_get_dispatch_time(Args->Agent, Args->Signal->get(),
+                                          &DispatchTime) != HSA_STATUS_SUCCESS)
+    return {0, 0};
+
+  const auto Start =
+      static_cast<uint64_t>(DispatchTime.start * Args->TicksToTime);
+  const auto End = static_cast<uint64_t>(DispatchTime.end * Args->TicksToTime);
+  return {Start, End};
+}
+
+/// Query the HSA async-copy profiling times for Args->Signal and scale start
+/// and end by Args->TicksToTime.
+/// Returns {0, 0} when the HSA query does not succeed.
+/// NOTE(review): no caller is visible in this hunk; presumably used by
+/// timeDataTransferInNsAsync, whose definition is not part of this hunk
+/// either - confirm it exists.
+static std::pair<uint64_t, uint64_t>
+getCopyStartAndEndTime(const ProfilingInfoTy *Args) {
+  hsa_amd_profiling_async_copy_time_t CopyTime = {};
+  if (hsa_amd_profiling_get_async_copy_time(Args->Signal->get(), &CopyTime) !=
+      HSA_STATUS_SUCCESS)
+    return {0, 0};
+
+  const auto Start = static_cast<uint64_t>(CopyTime.start * Args->TicksToTime);
+  const auto End = static_cast<uint64_t>(CopyTime.end * Args->TicksToTime);
+  return {Start, End};
+}
+
+/// Stream callback: read the kernel start/end times for the ProfilingInfoTy
+/// passed in Data and forward them, together with the profiler-specific data,
+/// to the plugin profiler's handleKernelCompletion().
+static Error timeKernelInNsAsync(void *Data) {
+  assert(Data && "Invalid data pointer");
+  ProfilingInfoTy *Info = getProfilingInfo(Data);
+  assert(Info && "Invalid profiling info");
+  assert(Info->ProfilerSpecificData && "Invalid ProfilerSpecificData");
+
+  const std::pair<uint64_t, uint64_t> Times = getKernelStartAndEndTime(Info);
+  Info->Plugin->getProfiler()->handleKernelCompletion(
+      Times.first, Times.second, Info->ProfilerSpecificData);
+  return Plugin::success();
+}
+
} // namespace plugin
} // namespace target
} // namespace omp
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 51e2bdb0c01dc..add99e95b07e9 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1331,6 +1331,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
/// Returns the clock frequency for the given NVPTX device.
uint64_t getClockFrequency() const override { return 1000000000; }
+ /// Device timestamp stub for CUDA - full profiling is a future extension.
+ uint64_t getDeviceTimeStamp() override { return 0; }
+
private:
using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
>From aff6e16fed8235c3e89d7e1cd16df92f44527496 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 2 Apr 2026 07:40:01 -0500
Subject: [PATCH 04/15] [Offload][OMPT] Add OmptProfilerTy, common types, and
plugin tracing shim
Introduce the OMPT-specific profiler backend and shared type
definitions for the device tracing feature.
New files:
- OmptCommonDefs.h: Shared macros (FOREACH_OMPT_DEVICE_TRACING_FN,
OMPT_IF_TRACING_ENABLED, performOmptCallback), function pointer
typedefs for libomptarget_ompt_* entry points
- OmptEventInfoTy.h: Small struct carrying NumTeams and TraceRecord
pointer for async profiler data handoff
- OmptDeviceTracing.h: Plugin-side API surface for device tracing
state management, device ID mapping, clock correlation, and
dynamic symbol loading via ensureFuncPtrLoaded template
- OmptProfiler.h/cpp: OmptProfilerTy inheriting GenericProfilerTy
with OMPT callback bindings and profiler-specific data allocation.
Provides strong override of getProfilerToAttach() factory
- OmptTracing.cpp (plugin side): Shim that dynamically loads
libomptarget_ompt_* symbols, implements ompt_set_trace_ompt,
ompt_start_trace, ompt_stop_trace, ompt_flush_trace,
ompt_advance_buffer_cursor, ompt_get_record_ompt,
ompt_get_device_time, ompt_translate_time
Modified:
- common/CMakeLists.txt: Add PluginOmpt static library target
Made-with: Cursor
---
offload/include/OpenMP/OMPT/OmptCommonDefs.h | 127 +++++++
offload/include/OpenMP/OMPT/OmptEventInfoTy.h | 39 +++
offload/plugins-nextgen/common/CMakeLists.txt | 24 ++
.../common/OMPT/OmptDeviceTracing.h | 133 ++++++++
.../common/OMPT/OmptProfiler.cpp | 163 +++++++++
.../common/OMPT/OmptProfiler.h | 166 ++++++++++
.../common/OMPT/OmptTracing.cpp | 313 ++++++++++++++++++
7 files changed, 965 insertions(+)
create mode 100644 offload/include/OpenMP/OMPT/OmptCommonDefs.h
create mode 100644 offload/include/OpenMP/OMPT/OmptEventInfoTy.h
create mode 100644 offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h
create mode 100644 offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp
create mode 100644 offload/plugins-nextgen/common/OMPT/OmptProfiler.h
create mode 100644 offload/plugins-nextgen/common/OMPT/OmptTracing.cpp
diff --git a/offload/include/OpenMP/OMPT/OmptCommonDefs.h b/offload/include/OpenMP/OMPT/OmptCommonDefs.h
new file mode 100644
index 0000000000000..5391658e80262
--- /dev/null
+++ b/offload/include/OpenMP/OMPT/OmptCommonDefs.h
@@ -0,0 +1,127 @@
+//===------ OmptCommonDefs.h - Common definitions for OMPT --*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Common defines and typedefs for OMPT callback and tracing functionality.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_INCLUDE_OMPTCOMMONDEFS_H
+#define OFFLOAD_INCLUDE_OMPTCOMMONDEFS_H
+
+#ifdef OMPT_SUPPORT
+
+#include "omp-tools.h"
+
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
+
+// Cast x to void * by way of uint64_t. The argument is parenthesized so that
+// an expression argument (e.g. a conditional) is converted as a whole.
+#define FUNCPTR_TO_PTR(x) ((void *)(uint64_t)(x))
+
+// X-macro lists: each FOREACH_* macro applies `macro` to every entry of the
+// corresponding list.
+
+// All target-related callbacks: device, non-EMI and EMI events.
+#define FOREACH_OMPT_TARGET_CALLBACK(macro) \
+ FOREACH_OMPT_DEVICE_EVENT(macro) \
+ FOREACH_OMPT_NOEMI_EVENT(macro) \
+ FOREACH_OMPT_EMI_EVENT(macro)
+
+// Common device tracing functions (shared by the two lists below)
+#define FOREACH_OMPT_DEVICE_TRACING_FN_COMMON(macro) \
+ macro(ompt_set_trace_ompt) macro(ompt_start_trace) macro(ompt_flush_trace) \
+ macro(ompt_stop_trace) macro(ompt_advance_buffer_cursor) \
+ macro(ompt_get_record_type)
+
+// Supported device tracing entry points: the common set plus record access
+// and device time queries.
+#define FOREACH_OMPT_DEVICE_TRACING_FN(macro) \
+ FOREACH_OMPT_DEVICE_TRACING_FN_COMMON(macro) \
+ macro(ompt_get_record_ompt) macro(ompt_get_device_time) \
+ macro(ompt_translate_time)
+
+// Device tracing functions for which a function pointer and a guarding mutex
+// are declared: the common set plus the timestamp / granted-teams setters.
+// NOTE(review): "IMPLEMENTAIONS" is misspelled; a rename must update every
+// user of this macro at the same time.
+#define FOREACH_OMPT_DEVICE_TRACING_FN_IMPLEMENTAIONS(macro) \
+ FOREACH_OMPT_DEVICE_TRACING_FN_COMMON(macro) \
+ macro(ompt_set_timestamp) macro(ompt_set_granted_teams)
+
+#define OMPT_API_ROUTINE static
+
+#define OMPT_CALLBACK_AVAILABLE(fn) (llvm::omp::target::ompt::Initialized && fn)
+
+#define OMPT_IF_BUILT(stmt) stmt
+
+// Execute `stmts` only if llvm::omp::target::ompt::Initialized is true.
+// `stmts` is pasted verbatim, so it must carry its own terminating ';'.
+#define OMPT_IF_ENABLED(stmts) \
+ do { \
+ if (llvm::omp::target::ompt::Initialized) { \
+ stmts \
+ } \
+ } while (0)
+
+// Execute `stmts` only if llvm::omp::target::ompt::TracingActive is true.
+// `stmts` is pasted verbatim, so it must carry its own terminating ';'.
+#define OMPT_IF_TRACING_ENABLED(stmts) \
+ do { \
+ if (llvm::omp::target::ompt::TracingActive) { \
+ stmts \
+ } \
+ } while (0)
+
+// Frame flags: a runtime frame combined with the architecture's default
+// frame-position kind selected below. Macros expand lazily, so
+// OMPT_FRAME_POSITION_DEFAULT may be defined after this line.
+#define OMPT_FRAME_FLAGS (ompt_frame_runtime | OMPT_FRAME_POSITION_DEFAULT)
+
+// The frame address is obtained identically on every architecture; only the
+// position kind differs.
+#define OMPT_GET_FRAME_ADDRESS(level) __builtin_frame_address(level)
+
+#if (__PPC64__ | __arm__)
+#define OMPT_FRAME_POSITION_DEFAULT ompt_frame_cfa
+#else
+#define OMPT_FRAME_POSITION_DEFAULT ompt_frame_framepointer
+#endif
+
+#define OMPT_PTR_UNKNOWN ((void *)0)
+
+// Execute `stmt` (a ';' is appended by the macro) only if
+// llvm::omp::target::ompt::Initialized is true.
+#define performIfOmptInitialized(stmt) \
+ do { \
+ if (llvm::omp::target::ompt::Initialized) { \
+ stmt; \
+ } \
+ } while (0)
+
+// Invoke ompt_callback_<CallbackName>_fn with the given arguments if that
+// function pointer is non-null. The name is resolved in the scope where the
+// macro is expanded.
+#define performOmptCallback(CallbackName, ...) \
+ do { \
+ if (ompt_callback_##CallbackName##_fn) \
+ ompt_callback_##CallbackName##_fn(__VA_ARGS__); \
+ } while (0)
+
+// Signatures of the libomptarget_ompt_* entry points. The
+// libomptarget_<name>_t naming is what the FOREACH_OMPT_DEVICE_TRACING_FN*
+// X-macros paste together, so every listed function needs a typedef here.
+typedef ompt_set_result_t (*libomptarget_ompt_set_trace_ompt_t)(
+    int Device, unsigned int Enable, unsigned int EventTy);
+typedef int (*libomptarget_ompt_start_trace_t)(int,
+                                               ompt_callback_buffer_request_t,
+                                               ompt_callback_buffer_complete_t);
+typedef int (*libomptarget_ompt_flush_trace_t)(int);
+typedef int (*libomptarget_ompt_stop_trace_t)(int);
+typedef int (*libomptarget_ompt_advance_buffer_cursor_t)(
+    ompt_device_t *, ompt_buffer_t *, size_t, ompt_buffer_cursor_t,
+    ompt_buffer_cursor_t *);
+typedef ompt_get_record_ompt_t libomptarget_ompt_get_record_ompt_t;
+typedef ompt_device_time_t (*libomptarget_ompt_get_device_time_t)(
+    ompt_device_t *);
+typedef ompt_translate_time_t libomptarget_ompt_translate_time_t;
+typedef ompt_record_t (*libomptarget_ompt_get_record_type_t)(
+    ompt_buffer_t *, ompt_buffer_cursor_t);
+typedef void (*libomptarget_ompt_set_timestamp_t)(uint64_t start, uint64_t end);
+typedef void (*libomptarget_ompt_set_granted_teams_t)(uint32_t);
+
+/// Function type def used for maintaining unique target region, target
+/// operations ids
+typedef uint64_t (*IdInterfaceTy)();
+
+#pragma pop_macro("DEBUG_PREFIX")
+
+#else
+#define performIfOmptInitialized(stmt)
+#define OMPT_IF_BUILT(stmt)
+#define OMPT_IF_ENABLED(stmts)
+#define OMPT_IF_TRACING_ENABLED(stmts)
+#endif // OMPT_SUPPORT
+
+#endif // OFFLOAD_INCLUDE_OMPTCOMMONDEFS_H
diff --git a/offload/include/OpenMP/OMPT/OmptEventInfoTy.h b/offload/include/OpenMP/OMPT/OmptEventInfoTy.h
new file mode 100644
index 0000000000000..7124b3a3ff501
--- /dev/null
+++ b/offload/include/OpenMP/OMPT/OmptEventInfoTy.h
@@ -0,0 +1,39 @@
+//===- OmptEventInfoTy.h - OMPT specific trace record data ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structure used to communicate OMPT specific profiler data from the
+// high-level libomptarget into the vendor-specific plugins
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_INCLUDE_OPENMP_OMPT_OMPTEVENTINFOTY_H
+#define OFFLOAD_INCLUDE_OPENMP_OMPT_OMPTEVENTINFOTY_H
+
+#include "Shared/Debug.h"
+
+struct ompt_record_ompt_t;
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace ompt {
+
+/// Holds info needed to fill asynchronous trace records.
+/// Both members have default initializers so that a default-initialized
+/// instance never carries indeterminate values.
+struct OmptEventInfoTy {
+  /// The granted number of teams at runtime
+  uint64_t NumTeams = 0;
+  /// Pointer to the actual buffer storage location
+  ompt_record_ompt_t *TraceRecord = nullptr;
+};
+
+} // namespace ompt
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OFFLOAD_INCLUDE_OPENMP_OMPT_OMPTEVENTINFOTY_H
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index ad018da333400..a5154b97c5d0b 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -37,6 +37,30 @@ endif()
include(FindLibcCommonUtils)
target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
+# Static helper library with the OMPT profiler backend and the plugin-side
+# tracing shim. Only created when both OMPT options are enabled.
+if (OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
+ add_library(PluginOmpt STATIC OMPT/OmptTracing.cpp OMPT/OmptProfiler.cpp)
+ target_include_directories(PluginOmpt PUBLIC
+ OMPT
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${CMAKE_CURRENT_BINARY_DIR}/include
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
+ ${LIBOMPTARGET_INCLUDE_DIR}
+ )
+ target_compile_options(PluginOmpt PUBLIC ${offload_compile_flags} -fPIC)
+ target_link_options(PluginOmpt PUBLIC ${offload_link_flags})
+ # NOTE(review): these values are quoted string literals, whereas PluginCommon
+ # below passes TARGET_NAME unquoted - confirm which form the debug macros
+ # expect.
+ target_compile_definitions(PluginOmpt PRIVATE
+ TARGET_NAME="Profiler"
+ DEBUG_PREFIX="OMPT"
+ )
+ add_dependencies(PluginOmpt PluginErrcodes)
+endif()
+
+if (TARGET flang_rt.runtime.static)
+ target_link_libraries(PluginCommon PRIVATE flang_rt.runtime.static)
+ target_compile_definitions(PluginCommon PRIVATE OFFLOAD_HAS_FLANG_RT)
+endif()
+
# Define the TARGET_NAME and DEBUG_PREFIX.
target_compile_definitions(PluginCommon PRIVATE
TARGET_NAME=PluginInterface
diff --git a/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h b/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h
new file mode 100644
index 0000000000000..5744a784825da
--- /dev/null
+++ b/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h
@@ -0,0 +1,133 @@
+//===- OmptDeviceTracing.h - Target independent OMPT callbacks --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface used by target-independent runtimes to coordinate registration and
+// invocation of OMPT tracing functionality.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_OMPTDEVICETRACING_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_OMPTDEVICETRACING_H
+
+#ifdef OMPT_SUPPORT
+
+#include "OpenMP/OMPT/OmptCommonDefs.h"
+
+#include "llvm/Support/DynamicLibrary.h"
+
+#include <map>
+#include <memory>
+
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
+
+extern void setOmptAsyncCopyProfile(bool Enable);
+extern void setGlobalOmptKernelProfile(void *Device, int Enable);
+extern uint64_t getSystemTimestampInNs();
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace ompt {
+
+// Declare OMPT device tracing function entry points
+#define declareOmptTracingFn(Name) extern libomptarget_##Name##_t Name##_fn;
+FOREACH_OMPT_DEVICE_TRACING_FN_IMPLEMENTAIONS(declareOmptTracingFn)
+#undef declareOmptTracingFn
+
+// Declare OMPT device tracing function mutexes
+#define declareOmptTracingFnMutex(Name) extern std::mutex Name##_mutex;
+FOREACH_OMPT_DEVICE_TRACING_FN_IMPLEMENTAIONS(declareOmptTracingFnMutex)
+#undef declareOmptTracingFnMutex
+
+extern std::mutex DeviceIdWritingMutex;
+
+/// Activate tracing on the given device
+void enableDeviceTracing(int DeviceId);
+
+/// Deactivate tracing on the given device
+void disableDeviceTracing(int DeviceId);
+
+/// Set 'start' and 'stop' in trace records
+void setOmptTimestamp(uint64_t StartTime, uint64_t EndTime);
+
+/// Set the linear function correlation between host and device clocks
+void setOmptHostToDeviceRate(double Slope, double Offset);
+
+/// Set / store the number of granted teams in trace records
+void setOmptGrantedNumTeams(uint64_t NumTeams);
+
+/// Lookup the given device pointer and return its RTL device ID
+int getDeviceId(ompt_device_t *Device);
+
+/// Map the given device pointer to the given DeviceId
+void setDeviceId(ompt_device_t *Device, int32_t DeviceId);
+
+/// Remove the given device pointer from the current mapping
+void removeDeviceId(ompt_device_t *Device);
+
+/// Check whether the provided device is currently traced.
+bool isTracedDevice(int32_t DeviceId);
+
+/// Provide name based lookup for the device tracing functions
+extern ompt_interface_fn_t
+lookupDeviceTracingFn(const char *InterfaceFunctionName);
+
+/// Host to device linear clock correlation
+extern double HostToDeviceSlope;
+
+/// Host to device constant clock offset
+extern double HostToDeviceOffset;
+
+/// Mapping of device pointers to their corresponding RTL device ID
+extern std::map<ompt_device_t *, int32_t> Devices;
+
+/// Mapping of RTL device IDs to their currently enabled tracing event types.
+/// Note: Event type '0' (bit position) indicates if this device is traced.
+extern std::map<int32_t, uint64_t> TracedDevices;
+
+/// OMPT global tracing status. Indicates if at least one device is traced.
+extern bool TracingActive;
+
+/// Parent library pointer
+extern std::shared_ptr<llvm::sys::DynamicLibrary> ParentLibrary;
+
+/// Get the parent library by pointer. If it is not already set, it will set the
+/// parent library pointer.
+std::shared_ptr<llvm::sys::DynamicLibrary> getParentLibrary();
+
+/// Set the parent library by filename
+void setParentLibrary(const char *Filename);
+
+/// Search for FuncName inside the parent library and assign to FuncPtr.
+/// Does nothing if *FuncPtr is already set. *FuncPtr is left unchanged when
+/// the parent library is unavailable or invalid, or when the symbol cannot be
+/// found.
+/// IMPORTANT: This function assumes that the *caller* holds the respective lock
+/// for FuncPtr.
+template <typename FT>
+void ensureFuncPtrLoaded(const std::string &FuncName, FT *FuncPtr) {
+  if (*FuncPtr != nullptr)
+    return;
+
+  // Work on the handle we actually obtained instead of re-reading the global
+  // after getParentLibrary(), so a null ParentLibrary is never dereferenced.
+  std::shared_ptr<llvm::sys::DynamicLibrary> Library =
+      ParentLibrary != nullptr ? ParentLibrary : getParentLibrary();
+  if (Library == nullptr || !Library->isValid())
+    return;
+
+  if (void *SymbolPtr = Library->getAddressOfSymbol(FuncName.c_str()))
+    *FuncPtr = reinterpret_cast<FT>(SymbolPtr);
+}
+
+} // namespace ompt
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#pragma pop_macro("DEBUG_PREFIX")
+
+#endif // OMPT_SUPPORT
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_OMPTDEVICETRACING_H
diff --git a/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp b/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp
new file mode 100644
index 0000000000000..4b74492352c44
--- /dev/null
+++ b/offload/plugins-nextgen/common/OMPT/OmptProfiler.cpp
@@ -0,0 +1,163 @@
+//===- OmptProfiler.cpp - OMPT impl of GenericProfilerTy --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OmptProfilerTy
+//
+//===----------------------------------------------------------------------===//
+
+#include "OmptProfiler.h"
+#include "OpenMP/OMPT/Interface.h"
+#include "PluginInterface.h"
+#include "Shared/Debug.h"
+
+#include <memory>
+
+using namespace llvm::omp::target;
+
+/// Strong definition of the getProfilerToAttach() factory: hands out a fresh
+/// OmptProfilerTy. Intended to take precedence over the weak default in
+/// GenericProfiler.cpp when this file is linked in.
+std::unique_ptr<llvm::omp::target::plugin::GenericProfilerTy>
+getProfilerToAttach() {
+  using ProfilerTy = llvm::omp::target::ompt::OmptProfilerTy;
+  return std::make_unique<ProfilerTy>();
+}
+
+/// Record the device pointer -> user device id mapping and, if OMPT is
+/// initialized, dispatch the device_initialize callback. The atomic
+/// OmptInitialized flag lets the callback fire only on the false -> true
+/// transition.
+void ompt::OmptProfilerTy::handleInit(plugin::GenericDeviceTy *Device,
+                                      plugin::GenericPluginTy *Plugin) {
+  auto DeviceId = Device->getDeviceId();
+  auto *DevicePtr = reinterpret_cast<ompt_device_t *>(Device);
+  ompt::setDeviceId(DevicePtr, Plugin->getUserId(DeviceId));
+
+  if (!ompt::Initialized)
+    return;
+
+  bool WasInitialized = false;
+  if (!OmptInitialized.compare_exchange_strong(WasInitialized, true))
+    return;
+
+  performOmptCallback(device_initialize, Plugin->getUserId(DeviceId),
+                      /*type=*/Device->getComputeUnitKind().c_str(),
+                      /*device=*/DevicePtr,
+                      /*lookup=*/ompt::lookupDeviceTracingFn,
+                      /*documentation=*/nullptr);
+}
+
+/// If OMPT is initialized, dispatch the device_finalize callback on the
+/// true -> false transition of OmptInitialized. The device pointer is
+/// removed from the device-id mapping in every case.
+void ompt::OmptProfilerTy::handleDeinit(
+    plugin::GenericDeviceTy *Device, target::plugin::GenericPluginTy *Plugin) {
+  auto DeviceId = Device->getDeviceId();
+
+  bool WasInitialized = true;
+  if (ompt::Initialized &&
+      OmptInitialized.compare_exchange_strong(WasInitialized, false)) {
+    performOmptCallback(device_finalize, Plugin->getUserId(DeviceId));
+  }
+
+  ompt::removeDeviceId(reinterpret_cast<ompt_device_t *>(Device));
+}
+
+/// Dispatch the device_load callback for the image in InputTgtImage. Only the
+/// image size and host address are reported; file name, offsets, device
+/// address and module id are passed as null / 0. No-op unless OMPT is
+/// initialized.
+void ompt::OmptProfilerTy::handleLoadBinary(plugin::GenericDeviceTy *Device,
+                                            plugin::GenericPluginTy *Plugin,
+                                            const StringRef InputTgtImage) {
+  if (ompt::Initialized) {
+    auto DeviceId = Device->getDeviceId();
+    size_t ImageBytes = InputTgtImage.size();
+    auto *HostAddr = const_cast<unsigned char *>(InputTgtImage.bytes_begin());
+    performOmptCallback(device_load, Plugin->getUserId(DeviceId),
+                        /*FileName=*/nullptr, /*FileOffset=*/0,
+                        /*VmaInFile=*/nullptr,
+                        /*ImgSize=*/ImageBytes,
+                        /*HostAddr=*/HostAddr,
+                        /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0);
+  }
+}
+
+/// Forward the start/end times of a data allocation to setOmptTimestamp().
+/// HostPtr, Size and Data are not used by this implementation.
+void ompt::OmptProfilerTy::handleDataAlloc(uint64_t StartNanos,
+ uint64_t EndNanos, void *HostPtr,
+ uint64_t Size, void *Data) {
+ ompt::setOmptTimestamp(StartNanos, EndNanos);
+}
+
+/// Forward the start/end times of a data deletion to setOmptTimestamp().
+/// TgtPtr and Data are not used by this implementation.
+void ompt::OmptProfilerTy::handleDataDelete(uint64_t StartNanos,
+ uint64_t EndNanos, void *TgtPtr,
+ void *Data) {
+ ompt::setOmptTimestamp(StartNanos, EndNanos);
+}
+
+/// Record the granted number of teams (NumBlocks[0]) before a kernel launch,
+/// both through setOmptGrantedNumTeams() and in the OmptEventInfoTy attached
+/// to the async info.
+/// No-op if the device is not traced, or if there is no async info or no
+/// profiler data attached to it.
+void ompt::OmptProfilerTy::handlePreKernelLaunch(
+    plugin::GenericDeviceTy *Device, uint32_t NumBlocks[3],
+    __tgt_async_info *AI) {
+  if (!ompt::isTracedDevice(
+          ompt::getDeviceId(reinterpret_cast<ompt_device_t *>(Device))))
+    return;
+
+  // Guard AI itself as well; it was previously dereferenced unconditionally.
+  if (AI == nullptr || AI->ProfilerData == nullptr)
+    return;
+
+  auto *EventInfo =
+      reinterpret_cast<ompt::OmptEventInfoTy *>(AI->ProfilerData);
+
+  // Set number of granted teams for OMPT
+  const uint32_t GrantedTeams = NumBlocks[0];
+  setOmptGrantedNumTeams(GrantedTeams);
+  EventInfo->NumTeams = GrantedTeams;
+}
+
+/// Complete the asynchronous target-submit trace record for a finished
+/// kernel.
+///
+/// \param StartNanos Kernel start time.
+/// \param EndNanos   Kernel end time.
+/// \param Data       OmptEventInfoTy entry for this launch, or null when the
+///                   launch is not traced.
+///
+/// The entry is released in every case where Data is non-null, including
+/// when tracing is off at completion time, so that it does not stay in the
+/// ProfilerData map for the lifetime of the profiler.
+void ompt::OmptProfilerTy::handleKernelCompletion(uint64_t StartNanos,
+                                                  uint64_t EndNanos,
+                                                  void *Data) {
+  /// Empty data means no tracing in OMPT
+  if (!Data)
+    return;
+
+  auto *OmptEventInfo = reinterpret_cast<ompt::OmptEventInfoTy *>(Data);
+
+  // Tracing is off: nothing is recorded, but the entry is still ours to free.
+  if (!isProfilingEnabled()) {
+    freeProfilerDataEntry(OmptEventInfo);
+    return;
+  }
+
+  ODBG(ODT_Tool) << "OMPT-Async: Time kernel for asynchronous execution: Start "
+                 << StartNanos << " End " << EndNanos;
+
+  assert(OmptEventInfo->TraceRecord && "Invalid TraceRecord");
+
+  ompt::RegionInterface.stopTargetSubmitTraceAsync(OmptEventInfo->TraceRecord,
+                                                   OmptEventInfo->NumTeams,
+                                                   StartNanos, EndNanos);
+
+  // Done processing, our responsibility to free the memory
+  freeProfilerDataEntry(OmptEventInfo);
+}
+
+/// Complete the asynchronous data-movement trace record for a finished
+/// transfer.
+///
+/// \param StartNanos Transfer start time.
+/// \param EndNanos   Transfer end time.
+/// \param Data       OmptEventInfoTy entry for this transfer, or null when
+///                   the transfer is not traced.
+///
+/// The entry is released in every case where Data is non-null, including
+/// when tracing is off at completion time, so that it does not stay in the
+/// ProfilerData map for the lifetime of the profiler.
+void ompt::OmptProfilerTy::handleDataTransfer(uint64_t StartNanos,
+                                              uint64_t EndNanos, void *Data) {
+  /// Empty data means no tracing in OMPT
+  if (!Data)
+    return;
+
+  auto *OmptEventInfo = reinterpret_cast<ompt::OmptEventInfoTy *>(Data);
+
+  // Tracing is off: nothing is recorded, but the entry is still ours to free.
+  if (!isProfilingEnabled()) {
+    freeProfilerDataEntry(OmptEventInfo);
+    return;
+  }
+
+  ODBG(ODT_Tool) << "OMPT-Async: Time data for asynchronous execution: Start "
+                 << StartNanos << " End " << EndNanos;
+
+  assert(OmptEventInfo->TraceRecord && "Invalid TraceRecord");
+
+  ompt::RegionInterface.stopTargetDataMovementTraceAsync(
+      OmptEventInfo->TraceRecord, StartNanos, EndNanos);
+
+  // Done processing, our responsibility to free the memory
+  freeProfilerDataEntry(OmptEventInfo);
+}
+
+/// Profiling is considered enabled exactly when ompt::TracingActive is set.
+bool ompt::OmptProfilerTy::isProfilingEnabled() { return ompt::TracingActive; }
+
+/// Log the given slope and offset and pass them on to
+/// setOmptHostToDeviceRate().
+void ompt::OmptProfilerTy::setTimeConversionFactorsImpl(double Slope,
+ double Offset) {
+ ODBG(ODT_Tool) << "Using Time Slope: " << Slope << " and Offset: " << Offset;
+ setOmptHostToDeviceRate(Slope, Offset);
+}
diff --git a/offload/plugins-nextgen/common/OMPT/OmptProfiler.h b/offload/plugins-nextgen/common/OMPT/OmptProfiler.h
new file mode 100644
index 0000000000000..9a12d0eda4a53
--- /dev/null
+++ b/offload/plugins-nextgen/common/OMPT/OmptProfiler.h
@@ -0,0 +1,166 @@
+//===- OmptProfiler.h - OMPT specific impl of GenericProfilerTy -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OMPT specific implementation of the GenericProfilerTy class.
+// This class uses the already existing implementation of OMPT to invoke
+// callbacks and perform tracing.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_PLUGINS_NEXTGEN_COMMON_OMPT_OMPTPROFILERTY_H
+#define OFFLOAD_PLUGINS_NEXTGEN_COMMON_OMPT_OMPTPROFILERTY_H
+
+#include "GenericProfiler.h"
+
+#include "OmptDeviceTracing.h"
+#include "OpenMP/OMPT/Callback.h"
+#include "OpenMP/OMPT/OmptEventInfoTy.h"
+#include "Shared/Debug.h"
+#include "omp-tools.h"
+
+#include <functional>
+#include <tuple>
+
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
+
+extern uint64_t getSystemTimestampInNs();
+
+using namespace llvm::omp::target::debug;
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+struct GenericDeviceTy;
+struct GenericPluginTy;
+class GenericProfilerTy;
+
+} // namespace plugin
+
+namespace ompt {
+
+// From Callback.h / Callback.cpp
+extern bool Initialized;
+
+/**
+ * Implements an OMPT backend for the Profiler interface used in the plugins.
+ *
+ * Forwards / Implements the different generic hooks with OMPT semantics.
+ * The class also owns the OmptEventInfoTy entries handed out through
+ * getProfilerSpecificData() until freeProfilerDataEntry() releases them.
+ */
+class OmptProfilerTy : public plugin::GenericProfilerTy {
+public:
+  /// Bind the tool's device-event callbacks and the common device tracing
+  /// entry points to the members of this object. Nothing is bound while
+  /// ompt::Initialized is false.
+  OmptProfilerTy() {
+    // Bind the callbacks to this device's member functions
+#define bindOmptCallback(Name, Type, Code) \
+  if (ompt::Initialized && ompt::lookupCallbackByCode) { \
+    ompt::lookupCallbackByCode((ompt_callbacks_t)(Code), \
+                               ((ompt_callback_t *)&(Name##_fn))); \
+    ODBG(ODT_Tool) << "class bound " << #Name \
+                   << "=" << ((void *)(uint64_t)Name##_fn); \
+  }
+
+    FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
+#undef bindOmptCallback
+
+    // ompt::lookupDeviceTracingFn is declared as a function, so its address
+    // is never null; only ompt::Initialized needs to be checked here.
+#define bindOmptTracingFunction(FunctionName) \
+  if (ompt::Initialized) { \
+    FunctionName##_fn = ompt::lookupDeviceTracingFn(#FunctionName); \
+    ODBG(ODT_Tool) << "device tracing fn bound " << #FunctionName \
+                   << "=" << ((void *)(uint64_t)FunctionName##_fn); \
+  }
+
+    FOREACH_OMPT_DEVICE_TRACING_FN_COMMON(bindOmptTracingFunction);
+#undef bindOmptTracingFunction
+  }
+
+  bool isProfilingEnabled() override;
+
+  void handleInit(plugin::GenericDeviceTy *Device,
+                  plugin::GenericPluginTy *Plugin) override;
+
+  void handleDeinit(plugin::GenericDeviceTy *Device,
+                    plugin::GenericPluginTy *Plugin) override;
+
+  void handleLoadBinary(plugin::GenericDeviceTy *Device,
+                        plugin::GenericPluginTy *Plugin,
+                        const StringRef InputTgtImage) override;
+
+  void handleDataAlloc(uint64_t StartNanos, uint64_t EndNanos, void *HostPtr,
+                       uint64_t Size, void *Data) override;
+  void handleDataDelete(uint64_t StartNanos, uint64_t EndNanos, void *TgtPtr,
+                        void *Data) override;
+
+  void handlePreKernelLaunch(plugin::GenericDeviceTy *Device,
+                             uint32_t NumBlocks[3],
+                             __tgt_async_info *AI) override;
+
+  void handleKernelCompletion(uint64_t StartNanos, uint64_t EndNanos,
+                              void *Data) override;
+
+  void handleDataTransfer(uint64_t StartNanos, uint64_t EndNanos,
+                          void *Data) override;
+
+  void setTimeConversionFactorsImpl(double Slope, double Offset) override;
+
+  /// Allocate a fresh, value-initialized OmptEventInfoTy owned by this
+  /// profiler and return a pointer to it. The entry stays alive until it is
+  /// passed to freeProfilerDataEntry() or the profiler is destroyed.
+  void *getProfilerSpecificData() override {
+    // TODO: This ID is currently only used as the map key.
+    uint64_t Id = OmptProfDataId.fetch_add(1);
+
+    std::scoped_lock Lock(ProfilerDataMutex);
+    // Ids are handed out once each, so this always inserts; a single lookup
+    // yields both the insertion and the pointer to return.
+    auto Inserted = ProfilerData.emplace(Id, std::make_unique<OmptEventInfoTy>());
+    return Inserted.first->second.get();
+  }
+
+  /// Release the entry whose storage is DataPtr. Does nothing if DataPtr was
+  /// not handed out by getProfilerSpecificData() or was already released.
+  void freeProfilerDataEntry(OmptEventInfoTy *DataPtr) {
+    std::scoped_lock Lock(ProfilerDataMutex);
+
+    // Linear scan by pointer; erase through the iterator and stop, so no
+    // iterator is used after the erase.
+    for (auto It = ProfilerData.begin(), End = ProfilerData.end(); It != End;
+         ++It) {
+      if (It->second.get() == DataPtr) {
+        ProfilerData.erase(It);
+        break;
+      }
+    }
+  }
+
+private:
+  /// Holds a unique ID for each allocation of OmptEventInfoTy
+  std::atomic<uint64_t> OmptProfDataId{0};
+
+  /// Holds memory used to store OMPT specific data and pass it down from
+  /// libomptarget into the plugins.
+  std::map<uint64_t, std::unique_ptr<OmptEventInfoTy>> ProfilerData;
+
+  /// Lock to guard STL ProfilerData map
+  std::mutex ProfilerDataMutex;
+
+  /// OMPT callback functions
+#define defineOmptCallback(Name, Type, Code) Name##_t Name##_fn = nullptr;
+  FOREACH_OMPT_DEVICE_EVENT(defineOmptCallback)
+#undef defineOmptCallback
+
+  /// OMPT device tracing functions
+#define defineOmptTracingFunction(Name) ompt_interface_fn_t Name##_fn = nullptr;
+  FOREACH_OMPT_DEVICE_TRACING_FN_COMMON(defineOmptTracingFunction);
+#undef defineOmptTracingFunction
+
+  /// Internal representation for OMPT device (initialize & finalize).
+  /// Starts out false; toggled by handleInit / handleDeinit.
+  std::atomic<bool> OmptInitialized{false};
+};
+} // namespace ompt
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#pragma pop_macro("DEBUG_PREFIX")
+
+#endif
diff --git a/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp b/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp
new file mode 100644
index 0000000000000..d5be9b8d9cd8a
--- /dev/null
+++ b/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp
@@ -0,0 +1,313 @@
+//===-- OmptTracing.cpp - Target independent OpenMP target RTL --- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OMPT tracing interfaces for PluginInterface
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef OMPT_SUPPORT
+
+#include "Shared/Debug.h"
+#include "OmptDeviceTracing.h"
+#include "omp-tools.h"
+
+#include "llvm/Support/DynamicLibrary.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <mutex>
+
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
+
+// Define OMPT device tracing function entry points
+#define defineOmptTracingFn(Name) \
+ libomptarget_##Name##_t llvm::omp::target::ompt::Name##_fn = nullptr;
+FOREACH_OMPT_DEVICE_TRACING_FN_IMPLEMENTAIONS(defineOmptTracingFn)
+#undef defineOmptTracingFn
+
+// Define OMPT device tracing function mutexes
+#define defineOmptTracingFnMutex(Name) \
+ std::mutex llvm::omp::target::ompt::Name##_mutex;
+FOREACH_OMPT_DEVICE_TRACING_FN_IMPLEMENTAIONS(defineOmptTracingFnMutex)
+#undef defineOmptTracingFnMutex
+
+std::mutex llvm::omp::target::ompt::DeviceIdWritingMutex;
+
+using namespace llvm::omp::target::ompt;
+using namespace llvm::omp::target::debug;
+
+std::shared_ptr<llvm::sys::DynamicLibrary>
+ llvm::omp::target::ompt::ParentLibrary(nullptr);
+
+double llvm::omp::target::ompt::HostToDeviceSlope = .0;
+double llvm::omp::target::ompt::HostToDeviceOffset = .0;
+
+std::map<ompt_device_t *, int32_t> llvm::omp::target::ompt::Devices;
+
+std::shared_ptr<llvm::sys::DynamicLibrary>
+llvm::omp::target::ompt::getParentLibrary() {
+  // On the first call, try to register "libomptarget.so" as parent library.
+  // Later calls only hand out the stored handle.
+  static bool IsAssigned = false;
+  if (IsAssigned)
+    return ParentLibrary;
+
+  setParentLibrary("libomptarget.so");
+  IsAssigned = true;
+  return ParentLibrary;
+}
+
+void llvm::omp::target::ompt::setParentLibrary(const char *Filename) {
+  // A parent library that has been set once is never replaced.
+  if (ParentLibrary != nullptr)
+    return;
+
+  std::string ErrorMsg;
+  llvm::sys::DynamicLibrary Library =
+      llvm::sys::DynamicLibrary::getPermanentLibrary(Filename, &ErrorMsg);
+  ParentLibrary = std::make_shared<llvm::sys::DynamicLibrary>(Library);
+
+  // Loading failures are only reported; the (invalid) handle stays stored.
+  if (!ParentLibrary || !ParentLibrary->isValid())
+    REPORT() << "Failed to set parent library: " << ErrorMsg.c_str();
+}
+
+int llvm::omp::target::ompt::getDeviceId(ompt_device_t *Device) {
+  // Hold the lock so that no other thread can erase the entry (for the same
+  // device) while it is being looked up.
+  std::unique_lock<std::mutex> Guard(DeviceIdWritingMutex);
+  const auto Entry = Devices.find(Device);
+  const bool IsKnownDevice = Device != nullptr && Entry != Devices.end();
+  if (!IsKnownDevice) {
+    REPORT() << "Failed to get ID for Device=" << Device;
+    return -1;
+  }
+  return Entry->second;
+}
+
+void llvm::omp::target::ompt::setDeviceId(ompt_device_t *Device,
+                                          int32_t DeviceId) {
+  assert(Device && "Mapping device ID to nullptr is not allowed");
+  // The assert is compiled out when NDEBUG is defined, so invalid input is
+  // also rejected explicitly.
+  if (Device == nullptr || DeviceId < 0) {
+    REPORT() << "Failed to set ID=" << DeviceId << " for Device=" << Device;
+    return;
+  }
+  std::unique_lock<std::mutex> Lock(DeviceIdWritingMutex);
+  auto DeviceIterator = Devices.find(Device);
+  if (DeviceIterator != Devices.end()) {
+    // An existing mapping is never modified; the attempt is only reported.
+    auto CurrentDeviceId = DeviceIterator->second;
+    if (DeviceId == CurrentDeviceId) {
+      REPORT() << "Tried to duplicate OMPT Device= " << Device
+               << " ID=" << DeviceId;
+    } else {
+      REPORT() << "Tried to overwrite OMPT Device=" << Device
+               << " (ID=" << CurrentDeviceId << ") with new ID=" << DeviceId;
+    }
+    return;
+  }
+  Devices.emplace(Device, DeviceId);
+}
+
+void llvm::omp::target::ompt::removeDeviceId(ompt_device_t *Device) {
+  // Look up and erase under a single lock acquisition, so that no other
+  // thread can change the mapping between the lookup and the erase. The
+  // lookup is done inline, because getDeviceId() acquires the same
+  // (non-recursive) mutex.
+  std::unique_lock<std::mutex> Lock(DeviceIdWritingMutex);
+  auto DeviceIterator = Devices.find(Device);
+  if (Device == nullptr || DeviceIterator == Devices.end()) {
+    REPORT() << "Tried to remove Device= " << Device << " ID=-1";
+    return;
+  }
+  // Copy the ID before the iterator is invalidated by the erase.
+  const int32_t DeviceId = DeviceIterator->second;
+  Devices.erase(DeviceIterator);
+  TracedDevices.erase(DeviceId);
+}
+
+OMPT_API_ROUTINE ompt_set_result_t ompt_set_trace_ompt(ompt_device_t *Device,
+                                                       unsigned int Enable,
+                                                       unsigned int EventTy) {
+  ODBG(ODT_Tool) << "Executing ompt_set_trace_ompt";
+
+  // Requests for devices without a registered ID are rejected.
+  int DeviceId = getDeviceId(Device);
+  if (DeviceId < 0) {
+    REPORT() << "Failed to set trace events for Device=" << Device
+             << " (Unknown device) [Enable=" << Enable
+             << " EventTy=" << EventTy << "]";
+    return ompt_set_never;
+  }
+
+  // Protect the function pointer; the lock stays held during the forwarded
+  // call.
+  std::unique_lock<std::mutex> Lock(ompt_set_trace_ompt_mutex);
+  ensureFuncPtrLoaded<libomptarget_ompt_set_trace_ompt_t>(
+      "libomptarget_ompt_set_trace_ompt", &ompt_set_trace_ompt_fn);
+  assert(ompt_set_trace_ompt_fn && "libomptarget_ompt_set_trace_ompt loaded");
+  return ompt_set_trace_ompt_fn(DeviceId, Enable, EventTy);
+}
+
+OMPT_API_ROUTINE int
+ompt_start_trace(ompt_device_t *Device, ompt_callback_buffer_request_t Request,
+                 ompt_callback_buffer_complete_t Complete) {
+  ODBG(ODT_Tool) << "Executing ompt_start_trace";
+
+  int DeviceId = getDeviceId(Device);
+  if (DeviceId < 0) {
+    REPORT() << "Failed to start trace for Device=" << Device
+             << " (Unknown device)";
+    // Indicate failure
+    return 0;
+  }
+
+  {
+    // Protect the function pointer
+    std::unique_lock<std::mutex> Lock(ompt_start_trace_mutex);
+
+    // Profiling is only switched on if both buffer callbacks were provided.
+    if (Request && Complete) {
+      llvm::omp::target::ompt::enableDeviceTracing(DeviceId);
+      // Enable asynchronous memory copy profiling
+      setOmptAsyncCopyProfile(/*Enable=*/true);
+      // Enable queue dispatch profiling. DeviceId is non-negative here,
+      // because unknown devices returned early above.
+      setGlobalOmptKernelProfile(Device, /*Enable=*/1);
+    }
+
+    // Call libomptarget specific function
+    ensureFuncPtrLoaded<libomptarget_ompt_start_trace_t>(
+        "libomptarget_ompt_start_trace", &ompt_start_trace_fn);
+    assert(ompt_start_trace_fn && "libomptarget_ompt_start_trace loaded");
+  }
+  // The forwarded call runs without holding the lock.
+  return ompt_start_trace_fn(DeviceId, Request, Complete);
+}
+
+OMPT_API_ROUTINE int ompt_flush_trace(ompt_device_t *Device) {
+  ODBG(ODT_Tool) << "Executing ompt_flush_trace";
+
+  // Serialize the on-demand lookup of the libomptarget entry point. The guard
+  // stays alive until this function returns.
+  std::unique_lock<std::mutex> Guard(ompt_flush_trace_mutex);
+  ensureFuncPtrLoaded<libomptarget_ompt_flush_trace_t>(
+      "libomptarget_ompt_flush_trace", &ompt_flush_trace_fn);
+  assert(ompt_flush_trace_fn && "libomptarget_ompt_flush_trace loaded");
+
+  // For an unknown device the lookup yields -1, which is forwarded unchanged.
+  const int DeviceId = getDeviceId(Device);
+  return ompt_flush_trace_fn(DeviceId);
+}
+
+OMPT_API_ROUTINE int ompt_stop_trace(ompt_device_t *Device) {
+  ODBG(ODT_Tool) << "Executing ompt_stop_trace";
+
+  int DeviceId = getDeviceId(Device);
+  if (DeviceId < 0) {
+    REPORT() << "Failed to stop trace for Device=" << Device
+             << " (Unknown device)";
+    // Indicate failure
+    return 0;
+  }
+
+  {
+    // Protect the function pointer
+    std::unique_lock<std::mutex> Lock(ompt_stop_trace_mutex);
+    llvm::omp::target::ompt::disableDeviceTracing(DeviceId);
+    // Disable asynchronous memory copy profiling
+    setOmptAsyncCopyProfile(/*Enable=*/false);
+    // Disable queue dispatch profiling. DeviceId is non-negative here,
+    // because unknown devices returned early above.
+    setGlobalOmptKernelProfile(Device, /*Enable=*/0);
+    ensureFuncPtrLoaded<libomptarget_ompt_stop_trace_t>(
+        "libomptarget_ompt_stop_trace", &ompt_stop_trace_fn);
+    assert(ompt_stop_trace_fn && "libomptarget_ompt_stop_trace loaded");
+  }
+  // The forwarded call runs without holding the lock.
+  return ompt_stop_trace_fn(DeviceId);
+}
+
+OMPT_API_ROUTINE ompt_record_ompt_t *
+ompt_get_record_ompt(ompt_buffer_t *Buffer, ompt_buffer_cursor_t CurrentPos) {
+  // TODO In debug mode, get the metadata associated with this buffer
+  // and assert that there are enough bytes for the current record
+
+  // No synchronization is done here: a disjoint set of trace records is
+  // handed over to each thread.
+
+  // The cursor value itself is reinterpreted as the record address. A null
+  // cursor therefore results in a nullptr return value, which the tool has to
+  // handle properly.
+  return reinterpret_cast<ompt_record_ompt_t *>(CurrentPos);
+}
+
+// Forwards the cursor-advance request to the libomptarget implementation,
+// which is looked up on demand via ensureFuncPtrLoaded. All arguments are
+// passed through unchanged and the callee's result is returned.
+OMPT_API_ROUTINE int ompt_advance_buffer_cursor(ompt_device_t *Device,
+ ompt_buffer_t *Buffer,
+ size_t Size,
+ ompt_buffer_cursor_t CurrentPos,
+ ompt_buffer_cursor_t *NextPos) {
+ // Note: The input parameter size is unused here. It refers to the
+ // bytes returned in the corresponding callback.
+ // Advance can be called concurrently, so synchronize setting the
+ // function pointer. The actual libomptarget function does not need
+ // to be synchronized since it must be working on logically disjoint
+ // buffers.
+ // NOTE(review): the lock below is function-scoped, so it is also held
+ // during the forwarded call, unlike what the comment above suggests --
+ // confirm whether the lock scope should be narrowed.
+ std::unique_lock<std::mutex> Lock(ompt_advance_buffer_cursor_mutex);
+ ensureFuncPtrLoaded<libomptarget_ompt_advance_buffer_cursor_t>(
+ "libomptarget_ompt_advance_buffer_cursor",
+ &ompt_advance_buffer_cursor_fn);
+ assert(ompt_advance_buffer_cursor_fn &&
+ "libomptarget_ompt_advance_buffer_cursor loaded");
+ return ompt_advance_buffer_cursor_fn(Device, Buffer, Size, CurrentPos,
+ NextPos);
+}
+
+// Forwards the record-type query for the record at CurrentPos in Buffer to
+// the libomptarget implementation, which is looked up on demand.
+OMPT_API_ROUTINE ompt_record_t
+ompt_get_record_type(ompt_buffer_t *Buffer, ompt_buffer_cursor_t CurrentPos) {
+ // Protect the function pointer; the lock is held until the function returns,
+ // i.e. also during the forwarded call.
+ std::unique_lock<std::mutex> Lock(ompt_get_record_type_mutex);
+ ensureFuncPtrLoaded<libomptarget_ompt_get_record_type_t>(
+ "libomptarget_ompt_get_record_type", &ompt_get_record_type_fn);
+ assert(ompt_get_record_type_fn && "libomptarget_ompt_get_record_type loaded");
+ return ompt_get_record_type_fn(Buffer, CurrentPos);
+}
+
+// Returns the value of getSystemTimestampInNs(). The Device argument is not
+// used, so the result does not depend on the queried device.
+OMPT_API_ROUTINE ompt_device_time_t
+ompt_get_device_time(ompt_device_t *Device) {
+ ODBG(ODT_Tool) << "Executing ompt_get_device_time";
+ return getSystemTimestampInNs();
+}
+
+OMPT_API_ROUTINE double ompt_translate_time(ompt_device_t *Device,
+                                            ompt_device_time_t DeviceTime) {
+  // Map the device timestamp onto the host timeline with a simple linear
+  // model (slope and offset obtained earlier). Clock skew and drift are not
+  // accounted for. The Device argument is not used.
+  const double HostTime = DeviceTime * HostToDeviceSlope + HostToDeviceOffset;
+  ODBG(ODT_Tool) << "D2H translated time: " << HostTime;
+  return HostTime;
+}
+
+void llvm::omp::target::ompt::setOmptTimestamp(uint64_t StartTime,
+                                               uint64_t EndTime) {
+  {
+    // Protect the function pointer while it is looked up on demand
+    std::unique_lock<std::mutex> Lock(ompt_set_timestamp_mutex);
+    ensureFuncPtrLoaded<libomptarget_ompt_set_timestamp_t>(
+        "libomptarget_ompt_set_timestamp", &ompt_set_timestamp_fn);
+    assert(ompt_set_timestamp_fn && "libomptarget_ompt_set_timestamp loaded");
+  }
+  // No need to hold a lock for the forwarded call itself
+  ompt_set_timestamp_fn(StartTime, EndTime);
+}
+
+// Stores the linear time conversion factors in HostToDeviceSlope and
+// HostToDeviceOffset, the two values ompt_translate_time applies to a device
+// time. No lock is taken for the two writes.
+void llvm::omp::target::ompt::setOmptHostToDeviceRate(double Slope,
+ double Offset) {
+ HostToDeviceSlope = Slope;
+ HostToDeviceOffset = Offset;
+}
+
+void llvm::omp::target::ompt::setOmptGrantedNumTeams(uint64_t NumTeams) {
+  {
+    // Protect the function pointer while it is looked up on demand
+    std::unique_lock<std::mutex> Lock(ompt_set_granted_teams_mutex);
+    ensureFuncPtrLoaded<libomptarget_ompt_set_granted_teams_t>(
+        "libomptarget_ompt_set_granted_teams", &ompt_set_granted_teams_fn);
+    assert(ompt_set_granted_teams_fn &&
+           "libomptarget_ompt_set_granted_teams loaded");
+  }
+  // No need to hold a lock for the forwarded call itself
+  ompt_set_granted_teams_fn(NumTeams);
+}
+
+// Maps the name of a device tracing entry point to its address. The name is
+// compared against every function listed by FOREACH_OMPT_DEVICE_TRACING_FN;
+// the first exact match is returned, otherwise nullptr.
+ompt_interface_fn_t llvm::omp::target::ompt::lookupDeviceTracingFn(
+ const char *InterfaceFunctionName) {
+// Expands to one strcmp-and-return per available function; the function's
+// identifier is stringified to obtain the name to compare against.
+#define compareAgainst(AvailableFunction) \
+ if (strcmp(InterfaceFunctionName, #AvailableFunction) == 0) \
+ return (ompt_interface_fn_t)AvailableFunction;
+
+ FOREACH_OMPT_DEVICE_TRACING_FN(compareAgainst);
+#undef compareAgainst
+
+ // No match: emit a debug message and signal the failure via nullptr.
+ ODBG(ODT_Tool) << "Warning: Could not find requested function "
+ << InterfaceFunctionName;
+ return (ompt_interface_fn_t) nullptr;
+}
+
+#pragma pop_macro("DEBUG_PREFIX")
+
+#endif // OMPT_SUPPORT
>From 1375d333e9dcbb011ef6c721a8e844e10a67a5f9 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 2 Apr 2026 07:40:55 -0500
Subject: [PATCH 05/15] [Offload][OMPT] Add tracing extensions to Interface.h
Extend the OMPT Interface class with trace record creation methods and
the async-aware TracerInterfaceRAII for device tracing support.
Key changes to Interface.h:
- Add includes for OmptEventInfoTy.h, APITypes.h, GenericProfiler.h
- Add extern TracingActive flag and isTracingEnabled() declaration
- Add trace start/stop methods for all target operations:
startTargetDataAllocTrace, stopTargetDataAllocTrace,
startTargetDataSubmitTrace, startTargetDataDeleteTrace,
startTargetDataRetrieveTrace, stopTargetDataMovementTraceAsync,
startTargetSubmitTrace, stopTargetSubmitTraceAsync, and
corresponding methods for enter/exit/update/target regions
- Add getTraceGenerators<>() template overloads parallel to
existing getCallbacks<>()
- Add private helpers: setTraceRecordCommon, setTraceRecordTargetDataOp,
setTraceRecordTargetKernel, setTraceRecordTarget, announceTargetRegion
- Add TracerInterfaceRAII class: async-aware RAII that checks
isTracingEnabled, calls begin trace generator, allocates
OmptEventInfoTy via profiler, sets AsyncInfo->ProfilerData
- Rename CallbackPairTy to FunctionPairTy in InterfaceRAII
- Make InvokeInterfaceFunction return a value
- Replace OMPT_IF_BUILT with OMPT_IF_TRACING_ENABLED
Key changes to Callback.h:
- Replace inline macro/typedef definitions with include of
OmptCommonDefs.h (shared definitions moved there in PR3)
Made-with: Cursor
---
offload/include/OpenMP/OMPT/Callback.h | 26 +--
offload/include/OpenMP/OMPT/Interface.h | 257 ++++++++++++++++++++++--
2 files changed, 245 insertions(+), 38 deletions(-)
diff --git a/offload/include/OpenMP/OMPT/Callback.h b/offload/include/OpenMP/OMPT/Callback.h
index 9d545c643223f..7aff68ced0d79 100644
--- a/offload/include/OpenMP/OMPT/Callback.h
+++ b/offload/include/OpenMP/OMPT/Callback.h
@@ -16,34 +16,12 @@
#ifdef OMPT_SUPPORT
-#include "omp-tools.h"
+#include "OmptCommonDefs.h"
#pragma push_macro("DEBUG_PREFIX")
#undef DEBUG_PREFIX
#define DEBUG_PREFIX "OMPT"
-#define FOREACH_OMPT_TARGET_CALLBACK(macro) \
- FOREACH_OMPT_DEVICE_EVENT(macro) \
- FOREACH_OMPT_NOEMI_EVENT(macro) \
- FOREACH_OMPT_EMI_EVENT(macro)
-
-#define performIfOmptInitialized(stmt) \
- do { \
- if (llvm::omp::target::ompt::Initialized) { \
- stmt; \
- } \
- } while (0)
-
-#define performOmptCallback(CallbackName, ...) \
- do { \
- if (ompt_callback_##CallbackName##_fn) \
- ompt_callback_##CallbackName##_fn(__VA_ARGS__); \
- } while (0)
-
-/// Function type def used for maintaining unique target region, target
-/// operations ids
-typedef uint64_t (*IdInterfaceTy)();
-
namespace llvm {
namespace omp {
namespace target {
@@ -98,8 +76,6 @@ extern bool Initialized;
#pragma pop_macro("DEBUG_PREFIX")
-#else
-#define performIfOmptInitialized(stmt)
#endif // OMPT_SUPPORT
#endif // OFFLOAD_INCLUDE_OPENMP_OMPT_CALLBACK_H
diff --git a/offload/include/OpenMP/OMPT/Interface.h b/offload/include/OpenMP/OMPT/Interface.h
index 6961641769b76..bc022227ac899 100644
--- a/offload/include/OpenMP/OMPT/Interface.h
+++ b/offload/include/OpenMP/OMPT/Interface.h
@@ -16,14 +16,21 @@
// Only provide functionality if target OMPT support is enabled
#ifdef OMPT_SUPPORT
#include "Callback.h"
+#include "OmptEventInfoTy.h"
+#include "Shared/APITypes.h"
+#include "Shared/Debug.h"
#include "omp-tools.h"
+#include "GenericProfiler.h"
+
#include "llvm/Support/ErrorHandling.h"
#include <functional>
#include <tuple>
-#define OMPT_IF_BUILT(stmt) stmt
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
/// Callbacks for target regions require task_data representing the
/// encountering task.
@@ -31,6 +38,7 @@
/// target_task_data representing the target task region.
typedef ompt_data_t *(*ompt_get_task_data_t)();
typedef ompt_data_t *(*ompt_get_target_task_data_t)();
+typedef int (*ompt_set_frame_enter_t)(void *Address, int Flags, int State);
namespace llvm {
namespace omp {
@@ -41,10 +49,19 @@ namespace ompt {
/// target_task_data.
static ompt_get_task_data_t ompt_get_task_data_fn;
static ompt_get_target_task_data_t ompt_get_target_task_data_fn;
+static ompt_set_frame_enter_t ompt_set_frame_enter_fn;
+
+/// OMPT global tracing status. Indicates if at least one device is traced.
+extern bool TracingActive;
+
+/// Check if this device traces the given event type
+extern bool isTracingEnabled(int DeviceId, unsigned int EventTy);
/// Used to maintain execution state for this thread
class Interface {
public:
+ // Target data callbacks
+
/// Top-level function for invoking callback before device data allocation
void beginTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin,
void **TgtPtrBegin, size_t Size, void *Code);
@@ -142,6 +159,85 @@ class Interface {
/// Top-level function for invoking callback after target construct
void endTarget(int64_t DeviceId, void *Code);
+ // Target data tracing
+
+ /// Top-level function for starting trace before device data allocation
+ void startTargetDataAllocTrace(int64_t DeviceId, void *HstPtrBegin,
+ void **TgtPtrBegin, size_t Size, void *Code);
+
+ /// Top-level function for stopping trace after device data allocation
+ ompt_record_ompt_t *stopTargetDataAllocTrace(int64_t DeviceId,
+ void *HstPtrBegin,
+ void **TgtPtrBegin, size_t Size,
+ void *Code);
+
+ /// Top-level function for starting trace before data submit
+ ompt_record_ompt_t *startTargetDataSubmitTrace(int64_t SrcDeviceId,
+ void *SrcPtrBegin,
+ int64_t DstDeviceId,
+ void *DstPtrBegin, size_t Size,
+ void *Code);
+
+ /// Top-level function for starting trace before device data deallocation
+ void startTargetDataDeleteTrace(int64_t DeviceId, void *TgtPtrBegin,
+ void *Code);
+
+ /// Top-level function for stopping trace after device data deallocation
+ ompt_record_ompt_t *stopTargetDataDeleteTrace(int64_t DeviceId,
+ void *TgtPtrBegin, void *Code);
+
+ /// Top-level function for starting trace before data retrieve
+ ompt_record_ompt_t *startTargetDataRetrieveTrace(int64_t SrcDeviceId,
+ void *SrcPtrBegin,
+ int64_t DstDeviceId,
+ void *DstPtrBegin,
+ size_t Size, void *Code);
+
+ ompt_record_ompt_t *
+ stopTargetDataMovementTraceAsync(ompt_record_ompt_t *DataPtr,
+ uint64_t NanosStart, uint64_t NanosEnd);
+
+ /// Top-level function for starting trace before kernel dispatch
+ ompt_record_ompt_t *startTargetSubmitTrace(int64_t DeviceId,
+ unsigned int NumTeams = 1);
+
+ ompt_record_ompt_t *stopTargetSubmitTraceAsync(ompt_record_ompt_t *DataPtr,
+ unsigned int NumTeams,
+ uint64_t NanosStart,
+ uint64_t NanosStop);
+
+ // Target region tracing
+
+ /// Top-level function for starting trace before target enter data
+ /// construct
+ ompt_record_ompt_t *startTargetDataEnterTrace(int64_t DeviceId, void *Code);
+
+ /// Top-level function for stopping trace after target enter data
+ /// construct
+ ompt_record_ompt_t *stopTargetDataEnterTrace(int64_t DeviceId, void *Code);
+
+ /// Top-level function for starting trace before target exit data
+ /// construct
+ ompt_record_ompt_t *startTargetDataExitTrace(int64_t DeviceId, void *Code);
+
+ /// Top-level function for stopping trace after target exit data
+ /// construct
+ ompt_record_ompt_t *stopTargetDataExitTrace(int64_t DeviceId, void *Code);
+
+ /// Top-level function for starting trace before target update construct
+ ompt_record_ompt_t *startTargetUpdateTrace(int64_t DeviceId, void *Code);
+
+ /// Top-level function for stopping trace after target update construct
+ ompt_record_ompt_t *stopTargetUpdateTrace(int64_t DeviceId, void *Code);
+
+ // Target kernel tracing
+
+ /// Top-level function for starting trace before target construct
+ ompt_record_ompt_t *startTargetTrace(int64_t DeviceId, void *Code);
+
+ /// Top-level function for stopping trace after target construct
+ ompt_record_ompt_t *stopTargetTrace(int64_t DeviceId, void *Code);
+
// Callback getter: Target data operations
template <ompt_target_data_op_t OpType> auto getCallbacks() {
if constexpr (OpType == ompt_target_data_alloc ||
@@ -217,6 +313,69 @@ class Interface {
llvm_unreachable("Unhandled target operation!");
}
+ // Trace generator getter: Target data operations
+ // Returns a pair of member function wrappers (start, stop) for the given
+ // data operation type, selected at compile time. Both transfer directions
+ // share stopTargetDataMovementTraceAsync as their stop function.
+ template <ompt_target_data_op_t OpType> auto getTraceGenerators() {
+ if constexpr (OpType == ompt_target_data_alloc ||
+ OpType == ompt_target_data_alloc_async)
+ return std::make_pair(std::mem_fn(&Interface::startTargetDataAllocTrace),
+ std::mem_fn(&Interface::stopTargetDataAllocTrace));
+
+ if constexpr (OpType == ompt_target_data_delete ||
+ OpType == ompt_target_data_delete_async)
+ return std::make_pair(std::mem_fn(&Interface::startTargetDataDeleteTrace),
+ std::mem_fn(&Interface::stopTargetDataDeleteTrace));
+
+ if constexpr (OpType == ompt_target_data_transfer_to_device ||
+ OpType == ompt_target_data_transfer_to_device_async)
+ return std::make_pair(
+ std::mem_fn(&Interface::startTargetDataSubmitTrace),
+ std::mem_fn(&Interface::stopTargetDataMovementTraceAsync));
+
+ if constexpr (OpType == ompt_target_data_transfer_from_device ||
+ OpType == ompt_target_data_transfer_from_device_async)
+ return std::make_pair(
+ std::mem_fn(&Interface::startTargetDataRetrieveTrace),
+ std::mem_fn(&Interface::stopTargetDataMovementTraceAsync));
+
+ llvm_unreachable("Unhandled target data operation type!");
+ }
+
+ // Trace generator getter: Target region operations
+ // Returns a pair of member function wrappers (start, stop) for the given
+ // target region type, selected at compile time. The regular and the nowait
+ // variant of a region type map to the same pair.
+ template <ompt_target_t OpType> auto getTraceGenerators() {
+ if constexpr (OpType == ompt_target_enter_data ||
+ OpType == ompt_target_enter_data_nowait)
+ return std::make_pair(std::mem_fn(&Interface::startTargetDataEnterTrace),
+ std::mem_fn(&Interface::stopTargetDataEnterTrace));
+
+ if constexpr (OpType == ompt_target_exit_data ||
+ OpType == ompt_target_exit_data_nowait)
+ return std::make_pair(std::mem_fn(&Interface::startTargetDataExitTrace),
+ std::mem_fn(&Interface::stopTargetDataExitTrace));
+
+ if constexpr (OpType == ompt_target_update ||
+ OpType == ompt_target_update_nowait)
+ return std::make_pair(std::mem_fn(&Interface::startTargetUpdateTrace),
+ std::mem_fn(&Interface::stopTargetUpdateTrace));
+
+ if constexpr (OpType == ompt_target || OpType == ompt_target_nowait)
+ return std::make_pair(std::mem_fn(&Interface::startTargetTrace),
+ std::mem_fn(&Interface::stopTargetTrace));
+
+ llvm_unreachable("Unknown target region operation type!");
+ }
+
+ // Trace generator getter: Kernel launch operation
+ // Returns the (start, stop) pair of member function wrappers for a target
+ // submit; ompt_callback_target_submit is the only handled value.
+ template <ompt_callbacks_t OpType> auto getTraceGenerators() {
+ // We use 'ompt_callbacks_t', because no other enum is currently available
+ // to model a kernel launch / target submit operation.
+ if constexpr (OpType == ompt_callback_target_submit)
+ return std::make_pair(
+ std::mem_fn(&Interface::startTargetSubmitTrace),
+ std::mem_fn(&Interface::stopTargetSubmitTraceAsync));
+
+ llvm_unreachable("Unhandled target operation!");
+ }
+
/// Setters for target region and target operation correlation ids
void setTargetDataValue(uint64_t DataValue) { TargetData.value = DataValue; }
void setTargetDataPtr(void *DataPtr) { TargetData.ptr = DataPtr; }
@@ -240,6 +399,9 @@ class Interface {
/// Target task data representing the target task region
ompt_data_t *TargetTaskData = nullptr;
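+ /// Used for announcing the target region named by RegionName.
+ /// NOTE(review): wording inferred from the signature; confirm against the
+ /// definition.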
+ void announceTargetRegion(const char *RegionName);
+
/// Used for marking begin of a data operation
void beginTargetDataOperation();
@@ -251,6 +413,23 @@ class Interface {
/// Used for marking end of a target region
void endTargetRegion();
+
+ // Called by all trace generation routines
+ void setTraceRecordCommon(ompt_record_ompt_t *DataPtr,
+ ompt_callbacks_t CallbackType);
+ // Type specific helpers
+ void setTraceRecordTargetDataOp(ompt_record_target_data_op_t *Record,
+ ompt_target_data_op_t DataOpType,
+ void *SrcAddr, int64_t SrcDeviceNum,
+ void *DstAddr, int64_t DstDeviceNum,
+ size_t Bytes, void *CodePtr);
+
+ void setTraceRecordTargetKernel(ompt_record_target_kernel_t *Record,
+ unsigned int NumTeams);
+
+ void setTraceRecordTarget(ompt_record_target_t *Record, int64_t DeviceId,
+ ompt_target_t TargetKind,
+ ompt_scope_endpoint_t Endpoint, void *CodePtr);
};
/// Thread local state for target region and associated metadata
@@ -262,14 +441,14 @@ extern thread_local Interface RegionInterface;
extern thread_local void *ReturnAddress;
template <typename FuncTy, typename ArgsTy, size_t... IndexSeq>
-void InvokeInterfaceFunction(FuncTy Func, ArgsTy Args,
+auto InvokeInterfaceFunction(FuncTy Func, ArgsTy Args,
std::index_sequence<IndexSeq...>) {
- std::invoke(Func, RegionInterface, std::get<IndexSeq>(Args)...);
+ return std::invoke(Func, RegionInterface, std::get<IndexSeq>(Args)...);
}
-template <typename CallbackPairTy, typename... ArgsTy> class InterfaceRAII {
+template <typename FunctionPairTy, typename... ArgsTy> class InterfaceRAII {
public:
- InterfaceRAII(CallbackPairTy Callbacks, ArgsTy... Args)
+ InterfaceRAII(FunctionPairTy Callbacks, ArgsTy... Args)
: Arguments(Args...), beginFunction(std::get<0>(Callbacks)),
endFunction(std::get<1>(Callbacks)) {
performIfOmptInitialized(begin());
@@ -290,14 +469,66 @@ template <typename CallbackPairTy, typename... ArgsTy> class InterfaceRAII {
}
std::tuple<ArgsTy...> Arguments;
- typename CallbackPairTy::first_type beginFunction;
- typename CallbackPairTy::second_type endFunction;
+ typename FunctionPairTy::first_type beginFunction;
+ typename FunctionPairTy::second_type endFunction;
};
// InterfaceRAII's class template argument deduction guide
-template <typename CallbackPairTy, typename... ArgsTy>
-InterfaceRAII(CallbackPairTy Callbacks, ArgsTy... Args)
- -> InterfaceRAII<CallbackPairTy, ArgsTy...>;
+template <typename FunctionPairTy, typename... ArgsTy>
+InterfaceRAII(FunctionPairTy Callbacks, ArgsTy... Args)
+ -> InterfaceRAII<FunctionPairTy, ArgsTy...>;
+
+/// Tracing counterpart of InterfaceRAII with support for asynchronous
+/// operations. In addition to the pair of trace generators it takes a
+/// reference to an AsyncInfo object, whose ProfilerData field is populated
+/// here; the AsyncInfoTy propagates the info into the RTL / plugins.
+/// TracedDeviceId represents the trace record's device affinity. EventType is
+/// the callback type that needs to be enabled via ompt_set_trace_ompt.
+template <typename FunctionPairTy, typename AsyncInfoTy, typename... ArgsTy>
+class TracerInterfaceRAII {
+public:
+  TracerInterfaceRAII(FunctionPairTy Callbacks, AsyncInfoTy &AsyncInfo,
+                      plugin::GenericProfilerTy *Prof, int TracedDeviceId,
+                      ompt_callbacks_t EventType, ArgsTy... Args)
+      : Arguments(Args...), beginFunction(std::get<0>(Callbacks)) {
+    __tgt_async_info *AI = AsyncInfo;
+    if (!isTracingEnabled(TracedDeviceId, EventType)) {
+      // Actively prevent further tracing of this event
+      AI->ProfilerData = nullptr;
+      return;
+    }
+
+    auto Record = begin();
+
+    // The profiler hands out storage that carries information from here to
+    // lower parts of the runtime system.
+    // NOTE: It is the responsibility of the programmer to ensure type
+    // compatibility and correct usage of the data. The profiler, however,
+    // OWNS the pointer and frees it at an appropriate time.
+    auto *EventInfo =
+        static_cast<OmptEventInfoTy *>(Prof->getProfilerSpecificData());
+    EventInfo->TraceRecord = Record;
+    EventInfo->NumTeams = 0;
+
+    // Pass the event info down into the plugins via the async info object
+    AI->ProfilerData = EventInfo;
+  }
+
+private:
+  /// Invoke the begin trace generator with the stored arguments and return
+  /// its result.
+  auto begin() {
+    constexpr size_t NumArguments = std::tuple_size_v<decltype(Arguments)>;
+    return InvokeInterfaceFunction(beginFunction, Arguments,
+                                   std::make_index_sequence<NumArguments>{});
+  }
+
+  std::tuple<ArgsTy...> Arguments;
+  typename FunctionPairTy::first_type beginFunction;
+  /// No end-function here, since the end is called asynchronously from the
+  /// plugins, once the operation has completed.
+};
+
+template <typename FunctionPairTy, typename... ArgsTy>
+TracerInterfaceRAII(FunctionPairTy Callbacks, ArgsTy... Args)
+ -> TracerInterfaceRAII<FunctionPairTy, ArgsTy...>;
/// Used to set and reset the thread-local return address. The RAII is expected
/// to be created at a runtime entry point when the return address should be
@@ -335,8 +566,8 @@ class ReturnAddressSetterRAII {
// The getter returns the address stored in the thread local variable.
#define OMPT_GET_RETURN_ADDRESS llvm::omp::target::ompt::ReturnAddress
-#else
-#define OMPT_IF_BUILT(stmt)
-#endif
+#pragma pop_macro("DEBUG_PREFIX")
+
+#endif // OMPT_SUPPORT
#endif // OFFLOAD_INCLUDE_OPENMP_OMPT_INTERFACE_H
>From 403eb72739506b311ab711a74f0bf60840f56c34 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 2 Apr 2026 07:44:36 -0500
Subject: [PATCH 06/15] [Offload][OMPT] Add OmptTracingBufferMgr trace buffer
infrastructure
Add the trace buffer management subsystem for OMPT device tracing. This
manages allocation, population, and flushing of trace record buffers
between OpenMP worker threads and helper threads.
New files:
- OmptTracingBuffer.h: OmptTracingBufferMgr class with Buffer metadata,
FlushInfo, TraceRecord types, thread-local per-device buffer pointers,
helper thread management, and cursor-based record allocation
- OmptTracingBuffer.cpp: Full implementation of buffer lifecycle:
assignCursor (lock-free fast path for existing buffers, locked path
for new allocations), triggerFlushOnBufferFull, driveCompletion
(helper thread main loop), invokeCallbacks, flushBuffer (dispatches
buffer-completion callbacks for ranges of ready records),
flushAllBuffers, helper thread start/shutdown/flush-and-shutdown
- OmptTracing.h: Libomptarget-side declarations for tracing state,
buffer management callback registration, device tracing control,
timestamp/clock correlation, and thread-local trace record fields.
Implementation deferred to subsequent PR.
Changes to existing files:
- PluginManager.h: Add OmptTracingBufferMgr* TraceRecordManager member
with getTraceRecordManager() accessor, include OmptTracingBuffer.h
- libomptarget/CMakeLists.txt: Add OmptTracingBuffer.cpp to omptarget
Key design points:
- Per-thread buffer assignment avoids contention on the common path
- Helper threads process buffers in flush-id order via condition variables
- LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL / FLUSH_ON_SHUTDOWN envars
control automatic flushing behavior
- Buffer states: init -> ready -> released track record lifecycle
- Flush states: waiting -> processing prevent concurrent helper access
Made-with: Cursor
---
offload/include/OpenMP/OMPT/OmptTracing.h | 154 ++++
.../include/OpenMP/OMPT/OmptTracingBuffer.h | 412 ++++++++++
offload/include/PluginManager.h | 10 +-
offload/libomptarget/CMakeLists.txt | 1 +
.../OpenMP/OMPT/OmptTracingBuffer.cpp | 763 ++++++++++++++++++
5 files changed, 1339 insertions(+), 1 deletion(-)
create mode 100644 offload/include/OpenMP/OMPT/OmptTracing.h
create mode 100644 offload/include/OpenMP/OMPT/OmptTracingBuffer.h
create mode 100644 offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp
diff --git a/offload/include/OpenMP/OMPT/OmptTracing.h b/offload/include/OpenMP/OMPT/OmptTracing.h
new file mode 100644
index 0000000000000..2a892582923d5
--- /dev/null
+++ b/offload/include/OpenMP/OMPT/OmptTracing.h
@@ -0,0 +1,154 @@
+//===---- OmptTracing.h - Target independent OMPT callbacks --*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface used by target-independent runtimes to coordinate registration and
+// invocation of OMPT tracing functionality.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_INCLUDE_OMPTTRACING_H
+#define OPENMP_LIBOMPTARGET_INCLUDE_OMPTTRACING_H
+
+#ifdef OMPT_SUPPORT
+
+#include <unordered_map>
+
+#include "OmptCommonDefs.h"
+#include "OmptTracingBuffer.h"
+
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace ompt {
+
+/// After a timestamp has been read, reset it.
+void resetTimestamp(uint64_t *T);
+
+/// A tool may register unique buffer-request and buffer-completion
+/// callback functions for a device. The following are utility functions to
+/// manage those functions.
+
+/// Given a device-id, return the corresponding buffer-request callback
+/// function.
+ompt_callback_buffer_request_t getBufferRequestFn(int DeviceId);
+
+/// Given a device-id, return the corresponding buffer-completion callback
+/// function.
+ompt_callback_buffer_complete_t getBufferCompleteFn(int DeviceId);
+
+/// Given a device-id, set the corresponding buffer-request and
+/// buffer-completion callback functions.
+void setBufferManagementFns(int DeviceId, ompt_callback_buffer_request_t ReqFn,
+ ompt_callback_buffer_complete_t CmpltFn);
+
+/// Given a device-id, remove the corresponding buffer-request and
+/// buffer-completion callback functions.
+void removeBufferManagementFns(int DeviceId);
+
+/// Is device tracing stopped for all devices?
+bool isAllDeviceTracingStopped();
+
+/// Invoke callback function for buffer request events
+void ompt_callback_buffer_request(int DeviceId, ompt_buffer_t **BufferPtr,
+ size_t *Bytes);
+
+/// Invoke callback function for buffer complete events
+void ompt_callback_buffer_complete(int DeviceId, ompt_buffer_t *Buffer,
+ size_t Bytes,
+ ompt_buffer_cursor_t BeginCursor,
+ int BufferOwned);
+
+/// Set 'start' and 'stop' for the current trace record
+void setOmptTimestamp(uint64_t StartTime, uint64_t EndTime);
+
+/// Set the linear function correlation between host and device clocks
+void setOmptHostToDeviceRate(double Slope, double Offset);
+
+/// Set / store the number of granted teams
+void setOmptGrantedNumTeams(uint64_t NumTeams);
+
+/// Check if (1) tracing is globally active (2) the given device is actively
+/// traced and (3) the given event type is traced on the device
+bool isTracingEnabled(int DeviceId, unsigned int EventTy);
+
+/// Check if the given device is actively traced
+bool isTracedDevice(int DeviceId);
+
+/// Check if the given device is monitoring the provided tracing type
+bool isTracingTypeEnabled(int DeviceId, unsigned int EventTy);
+
+/// Check if the given device is monitoring the provided tracing type 'group'
+/// Where group means we will check for both: EMI and non-EMI event types
+bool isTracingTypeGroupEnabled(int DeviceId, unsigned int EventTy);
+
+/// Set whether the given tracing type should be monitored (or not) on the
+/// device
+void setTracingTypeEnabled(uint64_t &TracedEventTy, bool Enable,
+ unsigned int EventTy);
+
+/// Set / reset the given tracing types (EventTy = 0 corresponds to 'all')
+ompt_set_result_t setTraceEventTy(int DeviceId, unsigned int Enable,
+ unsigned int EventTy);
+
+/// Return thread id
+uint64_t getThreadId();
+
+/// See TracedDevices in OmptDeviceTracing.h
+extern std::map<int32_t, uint64_t> TracedDevices;
+/// Activate tracing on the given device
+void enableDeviceTracing(int DeviceId);
+/// Deactivate tracing on the given device
+void disableDeviceTracing(int DeviceId);
+
+/// Mutex to serialize invocation of device registration and checks
+extern std::mutex DeviceAccessMutex;
+
+/// Mutexes to serialize invocation of device-independent entry points
+extern std::mutex TraceAccessMutex;
+extern std::mutex TraceControlMutex;
+
+/// Ensure serialization of calls to std::hash
+extern std::mutex TraceHashThreadMutex;
+
+/// Protect map from device-id to the corresponding buffer-request and
+/// buffer-completion callback functions.
+extern std::mutex BufferManagementFnMutex;
+
+/// Map from device-id to the corresponding buffer-request and buffer-completion
+/// callback functions.
+extern std::unordered_map<int, std::pair<ompt_callback_buffer_request_t,
+ ompt_callback_buffer_complete_t>>
+ BufferManagementFns;
+
+/// Thread local variables used by the plugin to communicate OMPT information
+/// that are then used to populate trace records. This method assumes a
+/// synchronous implementation, otherwise it won't work.
+extern thread_local uint32_t TraceRecordNumGrantedTeams;
+extern thread_local uint64_t TraceRecordStartTime;
+extern thread_local uint64_t TraceRecordStopTime;
+
+/// Thread local thread-id.
+extern thread_local uint64_t ThreadId;
+
+/// OMPT global tracing status. Indicates if at least one device is traced.
+extern bool TracingActive;
+
+} // namespace ompt
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#pragma pop_macro("DEBUG_PREFIX")
+
+#endif // OMPT_SUPPORT
+
+#endif // OPENMP_LIBOMPTARGET_INCLUDE_OMPTTRACING_H
diff --git a/offload/include/OpenMP/OMPT/OmptTracingBuffer.h b/offload/include/OpenMP/OMPT/OmptTracingBuffer.h
new file mode 100644
index 0000000000000..5c9f4bf33dae8
--- /dev/null
+++ b/offload/include/OpenMP/OMPT/OmptTracingBuffer.h
@@ -0,0 +1,412 @@
+//===- OmptTracingBuffer.h - OMPT device trace record buffers ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used for generating and flushing OMPT device trace records.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_OMPTTRACINGBUFFER_H
+#define OPENMP_LIBOMPTARGET_OMPTTRACINGBUFFER_H
+
+#ifdef OMPT_SUPPORT
+
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <omp-tools.h>
+
+#include "Shared/EnvironmentVar.h"
+
+// Maximum number of devices supported in device tracing. No device tracing
+// will be performed for any device-id larger than 1023.
+#define MAX_NUM_DEVICES 1024
+
+// TODO Start with 1 helper thread and add dynamically if required
+// Number of helper threads must not exceed 32 since the
+// thread-wait-tracker is 32 bits in length.
+#define OMPT_NUM_HELPER_THREADS 1
+
+/*
+ * Buffer manager for trace records generated by OpenMP master and
+ * worker threads. During device init, a tool may register a
+ * buffer-request and a buffer-completion callback. The buffer-request
+ * callback should be used to allocate new buffers as required. The
+ * buffer-complete callback should be used to return trace records to
+ * the tool.
+ *
+ * In addition to trace records, this class manages the helper threads
+ * for dispatching a range of trace records to the tool.
+ */
+class OmptTracingBufferMgr {
+public:
+ /*
+ * A trace record (TR) holds the trace data. Its type
+ * can be ompt or native. Currently, only ompt type is implemented.
+ */
+
+ /*
+ * A TR can be in the following states:
+ * TR_init: initial state
+ * TR_ready: An OpenMP thread marks a TR ready when it is done
+ * populating the TR
+ * TR_released: A helper thread marks a TR released after it has
+ * completed returning the TR to the tool
+ */
+ enum TRStatus { TR_init, TR_ready, TR_released };
+
+ /*
+ * Metadata capturing the state of a buffer of trace records. Once a
+ * buffer is allocated by an OpenMP worker thread, trace records are
+ * carved out from that buffer by that same OpenMP thread alone. Thus
+ * the allocated buffer is thread-specific from the allocation/population
+ * standpoint. But it may be manipulated by helper threads.
+ *
+ * Id, DeviceId, Start, and TotalBytes are not changed once set.
+ * RemainingBytes could be written multiple times but only by the same
+ * thread. But Cursor and IsFull may be read/written by an OpenMP worker
+ * thread and read by helper threads. Hence, accesses of
+ * this 2nd set of locations need to be atomic or synchronized.
+ */
+ struct Buffer {
+ uint64_t Id; // Unique identifier of the buffer
+ int64_t DeviceId; // Device for which this buffer is allocated
+ void *Start; // Start of allocated space for trace records
+ size_t TotalBytes; // Total number of bytes in the allocated space
+ size_t RemainingBytes; // Total number of unused bytes
+ // corresponding to Cursor
+ std::atomic<void *> Cursor; // Address of the last trace record carved out
+ std::atomic<bool> IsFull; // true if no more trace records can be
+ // accommodated, otherwise false
+ Buffer(uint64_t BufId, int64_t DevId, void *S, size_t Bytes, size_t Rem,
+ void *C, bool F)
+ : Id(BufId), DeviceId(DevId), Start(S), TotalBytes(Bytes),
+ RemainingBytes(Rem), Cursor(C), IsFull(F) {}
+ Buffer() = delete;
+ Buffer(const Buffer &) = delete;
+ Buffer &operator=(const Buffer &) = delete;
+ };
+ using BufPtr = std::shared_ptr<Buffer>;
+
+private:
+ /// Envar to control whether a buffer should be flushed when it gets full.
+ BoolEnvar OMPX_FlushOnBufferFull;
+
+ /// Envar to control whether all buffers should be flushed during shutdown.
+ BoolEnvar OMPX_FlushOnShutdown;
+
+ // Internal variable for tracking threads to wait for flush
+ uint32_t ThreadFlushTracker;
+
+ // Internal variable for tracking threads shutting down
+ uint32_t ThreadShutdownTracker;
+
+ using MapId2Buf = std::map<uint64_t, BufPtr>;
+
+ // Map from id to corresponding buffer. The ids are assigned in
+ // increasing order of creation.
+ MapId2Buf Id2BufferMap;
+
+ // Trace record. We currently support OMPT data type only. The state
+ // (TRStatus type) is maintained inline in the trace record. The
+ // tool is expected to access only the OMPT record.
+ struct TraceRecord {
+ ompt_record_ompt_t TR;
+ std::atomic<TRStatus> TRState;
+ };
+
+ // Thread-specific array of pointers to a buffer. The buffer pointed to
+ // is the last one allocated by this thread for a given device. The ith
+ // element points to the buffer for the ith device. At most MAX_NUM_DEVICES
+ // devices are supported.
+ static thread_local BufPtr ArrayOfBufPtr[MAX_NUM_DEVICES];
+
+ /*
+ * A buffer is flushed when it fills up or when the tool invokes
+ * flush_trace. So it's possible that the same buffer may be flushed
+ * more than once. When a buffer is flushed the first time, a unique
+ * id (flush-id) is generated and assigned to that buffer. Even if
+ * it is flushed again, the previously assigned id is maintained for
+ * that buffer. This id is loosely used to determine the order in
+ * which the buffers are processed and the corresponding trace
+ * records released to the tool.
+ */
+
+ struct FlushInfo {
+ uint64_t FlushId;
+ void *FlushCursor;
+ BufPtr FlushBuf;
+ FlushInfo() = default;
+ FlushInfo(uint64_t Id, void *CR, BufPtr Buf)
+ : FlushId{Id}, FlushCursor{CR}, FlushBuf{Buf} {}
+ };
+
+ /*
+ * A buffer may be in the following states:
+ * Flush_waiting: when a buffer is flushed, either because it is
+ * full or because the tool invokes ompt_flush_trace
+ * Flush_processing: when a helper thread claims the waiting buffer
+ * and is in the process of dispatching buffer-completion callbacks
+ * on an associated range of trace records. If all trace records are
+ * not released, the state may be reset to Flush_waiting after the
+ * buffer-completion callbacks return
+ */
+ enum BufferFlushStatus { Flush_waiting, Flush_processing };
+ struct FlushMd {
+ void *FlushCursor;
+ BufPtr FlushBuf;
+ BufferFlushStatus FlushStatus;
+ FlushMd(void *CR, BufPtr Buf, BufferFlushStatus Status)
+ : FlushCursor{CR}, FlushBuf{Buf}, FlushStatus{Status} {}
+ FlushMd() = delete;
+ };
+
+ using MapId2Md = std::map<uint64_t, FlushMd>;
+
+ /*
+ * A map from a flush-id to metadata containing the current
+ * cursor, the corresponding buffer, and its flushed status. If a
+ * buffer is flushed multiple times, the cursor is updated to the
+ * furthest one
+ */
+ MapId2Md Id2FlushMdMap;
+
+ using UMapBufPtr2Id = std::unordered_map<BufPtr, uint64_t>;
+
+ // A hash map from a buffer address to the corresponding flush-id
+ UMapBufPtr2Id FlushBufPtr2IdMap;
+
+ using USetCursor = std::unordered_set<void *>;
+
+ USetCursor LastCursors;
+
+ using UMapThd2Id = std::unordered_map<std::thread::id, uint32_t>;
+
+ // A hash map from a helper thread id to an integer
+ UMapThd2Id HelperThreadIdMap;
+
+ // Mutex to protect Id2BufferMap
+ std::mutex BufferMgrMutex;
+
+ // Mutex to protect FlushBufPtr2IdMap and Id2FlushMdMap
+ std::mutex FlushMutex;
+
+ // Mutex to protect metadata tracking last cursors of buffer-completion
+ // callbacks
+ std::mutex LastCursorMutex;
+
+ // Condition variable used to wake a helper thread when a flush is requested
+ std::condition_variable FlushCv;
+
+ // Condition variable used while waiting for flushing to complete
+ std::condition_variable ThreadFlushCv;
+
+ // Condition variable used while waiting for threads to shutdown
+ std::condition_variable ThreadShutdownCv;
+
+ // TODO Separate out the helper thread into its own class
+ std::vector<std::thread> CompletionThreads;
+
+ /// Called when a buffer \p Buf may be flushed with \p Cursor as the
+ /// last allocated trace record in the buffer.
+ /// triggerFlushOnBufferFull should be called without holding any lock.
+ void triggerFlushOnBufferFull(void *Cursor, BufPtr Buf);
+
+ // Called to dispatch buffer-completion callbacks for the trace records in
+ // this buffer
+ void flushBuffer(FlushInfo);
+
+ // Dispatch a buffer-completion callback with a range of trace records
+ void dispatchCallback(int64_t DeviceId, void *Buffer, void *FirstCursor,
+ void *LastCursor);
+
+ // Add a last cursor
+ void addLastCursor(void *Cursor) {
+ std::unique_lock<std::mutex> Lock(LastCursorMutex);
+ LastCursors.emplace(Cursor);
+ }
+
+ // Remove a last cursor
+ void removeLastCursor(void *Cursor) {
+ std::unique_lock<std::mutex> Lock(LastCursorMutex);
+ assert(LastCursors.find(Cursor) != LastCursors.end());
+ LastCursors.erase(Cursor);
+ }
+
+ // Given a trace record pointer, initialize its metadata
+ void initTraceRecordMetaData(void *Rec);
+
+ // Given a device-id, get/set a pointer to the last allocated buffer metadata.
+ BufPtr getDeviceSpecificBuffer(int64_t DevId);
+ void setDeviceSpecificBuffer(int64_t DevId, BufPtr Buf);
+
+ // Reserve a candidate buffer for flushing, preventing other helper threads
+ // from accessing it
+ FlushInfo findAndReserveFlushedBuf(uint64_t FlushId);
+
+ // Unreserve a buffer so that other helper threads can process it
+ void unreserveFlushedBuf(const FlushInfo &);
+
+ // All done with this buffer, so the buffer and its metadata can be removed
+ void destroyFlushedBuf(const FlushInfo &);
+
+ // Add a new buffer by an OpenMP thread so that a helper thread can process it
+ uint64_t addNewFlushEntry(BufPtr Buf, void *Cursor);
+
+ // Get the next trace record
+ void *getNextTR(void *TR);
+
+ // Given a buffer, return the latest cursor
+ void *getBufferCursor(BufPtr);
+
+ // Is no more space remaining for trace records in this buffer?
+ bool isBufferFull(const FlushInfo &);
+
+ // Have all trace records in this buffer been returned to the tool?
+ bool isBufferOwned(const FlushInfo &);
+
+ // Dispatch a buffer-completion callback and indicate that the buffer can be
+ // deallocated
+ void dispatchBufferOwnedCallback(const FlushInfo &);
+
+ // Main entry point for a helper thread
+ void driveCompletion();
+
+ // Examine the flushed buffers and dispatch buffer-completion callbacks
+ void invokeCallbacks();
+
+ // The caller does not hold a lock while calling this method
+ void waitForFlushCompletion();
+
+ // Given a thread number, set the corresponding bit in the flush
+ // tracker. The caller must hold the flush lock.
+ void setThreadFlush(uint32_t ThreadNum) {
+ ThreadFlushTracker |= (1 << ThreadNum);
+ }
+
+ // Reset this thread's flush bit. The caller must hold the flush lock
+ void resetThisThreadFlush() {
+ std::thread::id ID = std::this_thread::get_id();
+ assert(HelperThreadIdMap.find(ID) != HelperThreadIdMap.end());
+ ThreadFlushTracker &= ~(1 << HelperThreadIdMap[ID]);
+ }
+
+ // Given a thread number, set the corresponding bit in the shutdown
+ // tracker. The caller must hold the flush lock.
+ void setThreadShutdown(uint32_t ThreadNum) {
+ ThreadShutdownTracker |= (1 << ThreadNum);
+ }
+
+ // Reset this thread's shutdown bit. The caller must hold the flush
+ // lock
+ void resetThisThreadShutdown() {
+ std::thread::id ID = std::this_thread::get_id();
+ assert(HelperThreadIdMap.find(ID) != HelperThreadIdMap.end());
+ ThreadShutdownTracker &= ~(1 << HelperThreadIdMap[ID]);
+ }
+
+ // Return true if this thread's flush bit is set. The caller must
+ // hold the flush lock
+ bool isThisThreadFlushWaitedUpon() {
+ std::thread::id ID = std::this_thread::get_id();
+ assert(HelperThreadIdMap.find(ID) != HelperThreadIdMap.end());
+ return (ThreadFlushTracker & (1 << HelperThreadIdMap[ID])) != 0;
+ }
+
+ // Return true if this thread's shutdown bit is set. The caller must
+ // hold the flush lock
+ bool isThisThreadShutdownWaitedUpon() {
+ std::thread::id ID = std::this_thread::get_id();
+ assert(HelperThreadIdMap.find(ID) != HelperThreadIdMap.end());
+ return (ThreadShutdownTracker & (1 << HelperThreadIdMap[ID])) != 0;
+ }
+
+ // The caller must not hold the flush lock
+ bool amIHelperThread() {
+ std::unique_lock<std::mutex> flush_lock(FlushMutex);
+ if (HelperThreadIdMap.find(std::this_thread::get_id()) !=
+ HelperThreadIdMap.end())
+ return true;
+ return false;
+ }
+
+ // The caller must not hold the flush lock
+ bool areHelperThreadsAvailable();
+
+ // The caller must hold the appropriate lock
+ void init();
+
+ // The caller must hold the flush lock
+ void createHelperThreads();
+
+ // The caller must hold the flush lock
+ void destroyHelperThreads();
+
+public:
+ OmptTracingBufferMgr()
+ : OMPX_FlushOnBufferFull("LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL", true),
+ OMPX_FlushOnShutdown("LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN", true) {
+ // no need to hold locks for init() since object is getting constructed
+ // here.
+ init();
+ }
+
+ OmptTracingBufferMgr(const OmptTracingBufferMgr &) = delete;
+ OmptTracingBufferMgr &operator=(const OmptTracingBufferMgr &) = delete;
+
+ // The caller must not hold the flush lock
+ void startHelperThreads();
+
+ // The caller must not hold the flush lock. The helper threads are shut down
+ // without flushing any outstanding trace records.
+ void shutdownHelperThreads();
+
+ // The caller must not hold the flush lock. The helper threads are shut down
+ // after flushing all outstanding trace records for all devices.
+ void flushAndShutdownHelperThreads();
+
+ // Assign a cursor for a new trace record. This will assign a trace record
+ // for the provided device-id, allocating a new buffer if required.
+ void *assignCursor(ompt_callbacks_t Type, int64_t DeviceId);
+
+ // Get the size of a trace record
+ size_t getTRSize() { return sizeof(TraceRecord); }
+
+ // Get the status of a trace record. This function does not acquire
+ // a lock. If locking is required, the caller must hold a lock.
+ TRStatus getTRStatus(void *Rec);
+
+ // Set the status of a trace record. This function does not acquire
+ // a lock. If locking is required, the caller must hold a lock.
+ void setTRStatus(void *Rec, TRStatus);
+
+ // Is this a last cursor of a buffer completion callback?
+ bool isLastCursor(void *Cursor) {
+ std::unique_lock<std::mutex> Lock(LastCursorMutex);
+ return LastCursors.find(Cursor) != LastCursors.end();
+ }
+
+ // Called for flushing outstanding buffers for the provided device-id.
+ int flushAllBuffers(int DeviceId);
+};
+
+#else
+class OmptTracingBufferMgr {};
+#endif // OMPT_SUPPORT
+
+#endif // OPENMP_LIBOMPTARGET_OMPTTRACINGBUFFER_H
diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h
index 6c6fdebe76dff..972afd962cbc1 100644
--- a/offload/include/PluginManager.h
+++ b/offload/include/PluginManager.h
@@ -13,6 +13,7 @@
#ifndef OMPTARGET_PLUGIN_MANAGER_H
#define OMPTARGET_PLUGIN_MANAGER_H
+#include "OpenMP/OMPT/OmptTracingBuffer.h"
#include "PluginInterface.h"
#include "DeviceImage.h"
@@ -48,7 +49,7 @@ struct PluginManager {
/// Exclusive accessor type for the device container.
using ExclusiveDevicesAccessorTy = Accessor<DeviceContainerTy>;
- PluginManager() {}
+ PluginManager() : TraceRecordManager(nullptr) {}
void init();
@@ -150,6 +151,11 @@ struct PluginManager {
return count;
}
+ auto getTraceRecordManager() const {
+ assert(TraceRecordManager && "Trace record manager not initialized");
+ return TraceRecordManager;
+ }
+
private:
bool RTLsLoaded = false;
llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc;
@@ -176,6 +182,8 @@ struct PluginManager {
/// Devices associated with plugins, accesses to the container are exclusive.
ProtectedObj<DeviceContainerTy> Devices;
+ OmptTracingBufferMgr *TraceRecordManager;
+
/// References to upgraded legacy offloading entries.
std::list<llvm::SmallVector<llvm::offloading::EntryTy, 0>> LegacyEntries;
std::list<llvm::SmallVector<__tgt_device_image, 0>> LegacyImages;
diff --git a/offload/libomptarget/CMakeLists.txt b/offload/libomptarget/CMakeLists.txt
index 8e6314b2a6eae..33b32d21e3c69 100644
--- a/offload/libomptarget/CMakeLists.txt
+++ b/offload/libomptarget/CMakeLists.txt
@@ -17,6 +17,7 @@ add_library(omptarget SHARED
OpenMP/Mapping.cpp
OpenMP/InteropAPI.cpp
OpenMP/OMPT/Callback.cpp
+ OpenMP/OMPT/OmptTracingBuffer.cpp
KernelLanguage/API.cpp
)
diff --git a/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp b/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp
new file mode 100644
index 0000000000000..2d4b32e3554ac
--- /dev/null
+++ b/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp
@@ -0,0 +1,763 @@
+//===- OmptTracingBuffer.cpp - OMPT device trace record buffers -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OMPT device trace record generation and flushing.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef OMPT_SUPPORT
+
+#include "OpenMP/OMPT/OmptTracingBuffer.h"
+#include "OpenMP/OMPT/OmptTracing.h"
+#include "Shared/Debug.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <limits>
+
+using namespace llvm::omp::target::debug;
+
+// When set to true, helper threads terminate their work
+static bool DoneTracing{false};
+
+// Unique buffer id in creation order
+static std::atomic<uint64_t> BufId{0};
+
+// Unique id in buffer flush order
+static std::atomic<uint64_t> FlushId{0};
+
+thread_local OmptTracingBufferMgr::BufPtr
+ OmptTracingBufferMgr::ArrayOfBufPtr[MAX_NUM_DEVICES];
+
+static uint64_t get_and_inc_buf_id() { return BufId++; }
+
+static uint64_t get_and_inc_flush_id() { return FlushId++; }
+static uint64_t get_flush_id() { return FlushId; }
+
+/*
+ * Used by OpenMP threads for assigning space for a trace record. If
+ * there is no space in the last buffer allocated by this thread, the
+ * last buffer is marked full and scheduled for flushing. Otherwise,
+ * space is assigned for a trace record and the new cursor returned.
+ * Since the memory allocated by a thread is used by that thread alone
+ * for creating trace records, a lock need not be held. In the less
+ * common branch when memory is allocated, a lock needs to be acquired
+ * for updating shared metadata. The common path of allocating a trace
+ * record from an existing buffer proceeds without locking.
+ */
+void *OmptTracingBufferMgr::assignCursor(ompt_callbacks_t Type,
+ int64_t DeviceId) {
+ // The caller should handle nullptr by not tracing for this event.
+ if (DeviceId < 0 || DeviceId > MAX_NUM_DEVICES - 1)
+ return nullptr;
+
+ size_t RecSize = getTRSize();
+
+ // If the buffer fills up, it will be scheduled for flushing with the
+ // following cursor.
+ void *ToBeFlushedCursor = nullptr;
+ BufPtr ToBeFlushedBuf = nullptr;
+
+ // Thread local buffer pointer should be non-null once an allocation
+ // has been done by this thread.
+ BufPtr DeviceBuf = getDeviceSpecificBuffer(DeviceId);
+ if (DeviceBuf != nullptr) {
+ assert(DeviceBuf->DeviceId == DeviceId && "Unexpected device id in buffer");
+ void *OldCursor = DeviceBuf->Cursor.load(std::memory_order_acquire);
+ // Try to assign a trace record from the last allocated buffer
+ if (RecSize <= DeviceBuf->RemainingBytes) {
+ assert((char *)DeviceBuf->Start + DeviceBuf->TotalBytes -
+ DeviceBuf->RemainingBytes ==
+ (char *)OldCursor + RecSize);
+ DeviceBuf->RemainingBytes -= RecSize;
+
+ // Note the trace record status must be initialized before setting
+ // the cursor, ensuring that a helper thread always sees an initialized
+ // trace record status.
+ void *NewCursor = (char *)OldCursor + RecSize;
+ initTraceRecordMetaData(NewCursor);
+ DeviceBuf->Cursor.store(NewCursor, std::memory_order_release);
+
+ ODBG(ODT_Tool) << "Thread " << llvm::omp::target::ompt::getThreadId()
+ << ": Assigned " << RecSize << " bytes at " << NewCursor
+ << " in existing buffer " << DeviceBuf->Start
+ << " for device " << DeviceId;
+ return NewCursor;
+ } else {
+ ToBeFlushedCursor = OldCursor;
+ ToBeFlushedBuf = DeviceBuf;
+
+ // Mark that no space is present for any more trace records.
+ // The following is atomic but there is no logical order between when
+ // it is set here and when it is checked by a helper thread. That works
+ // because the helper thread uses this info to decide whether a buffer
+ // can be scheduled for removal. In the worst case, the buffer will be
+ // removed late.
+ DeviceBuf->IsFull.store(true, std::memory_order_release);
+ }
+ }
+ void *NewBuffer = nullptr;
+ size_t TotalBytes = 0;
+ // TODO Move the buffer allocation to a helper thread
+ llvm::omp::target::ompt::ompt_callback_buffer_request(DeviceId, &NewBuffer,
+ &TotalBytes);
+
+ // The caller should handle nullptr by not tracing for this event.
+ if (NewBuffer == nullptr || TotalBytes < RecSize)
+ return nullptr;
+
+ uint64_t NewBufId = get_and_inc_buf_id();
+ auto new_buf = std::make_shared<Buffer>(
+ NewBufId, DeviceId, /*Start=*/NewBuffer, TotalBytes,
+ /*RemainingBytes=*/TotalBytes - RecSize,
+ /*Cursor=*/NewBuffer,
+ /*IsFull=*/false);
+
+ // Initialize trace record status before publishing it to helper threads.
+ initTraceRecordMetaData(new_buf->Cursor.load(std::memory_order_acquire));
+ setDeviceSpecificBuffer(DeviceId, new_buf);
+
+ // Make this trace record visible to helper threads by adding to shared
+ // metadata.
+ std::unique_lock<std::mutex> lck(BufferMgrMutex);
+ assert(Id2BufferMap.find(NewBufId) == Id2BufferMap.end());
+ Id2BufferMap[NewBufId] = new_buf;
+ lck.unlock();
+
+ // Schedule the full buffer for flushing till the corresponding cursor.
+ if (OMPX_FlushOnBufferFull && ToBeFlushedCursor)
+ triggerFlushOnBufferFull(ToBeFlushedCursor, ToBeFlushedBuf);
+
+ ODBG(ODT_Tool) << "Thread " << llvm::omp::target::ompt::getThreadId()
+ << ": Assigned " << RecSize << " bytes at " << NewBuffer
+ << " in new buffer with id " << NewBufId << " for device "
+ << DeviceId;
+ return NewBuffer;
+}
+
+/*
+ * Called by an OpenMP thread when a buffer fills up and should be
+ * flushed. This function assigns a new FlushId to the buffer, adds
+ * to the flush-related metadata and wakes up a helper thread to
+ * dispatch a buffer-completion callback. This function should be
+ * called without holding any lock.
+ * Note lock order: buf_lock -> flush_lock
+ */
+void OmptTracingBufferMgr::triggerFlushOnBufferFull(void *cursor, BufPtr Buf) {
+ std::unique_lock<std::mutex> buf_lock(BufferMgrMutex);
+
+ // Between calling this function and this check, a flush-all may have
+ // delivered this buffer to the tool and deleted it. So the buffer
+ // may not exist.
+ if (Id2BufferMap.find(Buf->Id) == Id2BufferMap.end())
+ return;
+
+ // Cannot assert that the state of the cursor is ready since a
+ // different thread may be in the process of populating it. If it
+ // remains in init state when the range of trace records is
+ // determined for dispatching the buffer-completion callback, it
+ // will not be included.
+ std::unique_lock<std::mutex> flush_lock(FlushMutex);
+ uint64_t flush_id;
+ auto flush_itr = FlushBufPtr2IdMap.find(Buf);
+ if (flush_itr == FlushBufPtr2IdMap.end()) {
+ // This buffer has not been flushed yet
+ addNewFlushEntry(Buf, cursor);
+ } else {
+ // This buffer has been flushed before
+ flush_id = flush_itr->second;
+ auto flush_md_itr = Id2FlushMdMap.find(flush_id);
+ assert(flush_md_itr != Id2FlushMdMap.end());
+ flush_md_itr->second.FlushCursor = cursor; // update the cursor
+ // Do not update the flush status since it may be under processing
+ // by another thread
+ ODBG(ODT_Tool) << "Updated id " << flush_id << " cursor " << cursor
+ << " buf " << flush_md_itr->second.FlushBuf->Start;
+ }
+ flush_lock.unlock();
+ buf_lock.unlock();
+
+ // Wake up a helper thread to invoke the buffer-completion callback
+ FlushCv.notify_one();
+}
+
+// Main loop of a buffer-completion helper thread; it runs until DoneTracing is observed
+void OmptTracingBufferMgr::driveCompletion() {
+ while (true) {
+ bool should_signal_workers = false;
+ std::unique_lock<std::mutex> flush_lock(FlushMutex);
+ if (DoneTracing) {
+ // An upper layer serializes flush_trace and stop_trace. In
+ // addition, before DoneTracing is set, a flush is performed as
+ // part of stop_trace. So assert that no flush is in progress.
+ assert(ThreadFlushTracker == 0);
+ break;
+ }
+ FlushCv.wait(flush_lock, [this] { // wake on shutdown, pending flush metadata while tracing, or a flush request
+ return DoneTracing ||
+ (!Id2FlushMdMap.empty() &&
+ llvm::omp::target::ompt::TracingActive) ||
+ isThisThreadFlushWaitedUpon();
+ });
+ if (isThisThreadFlushWaitedUpon()) {
+ resetThisThreadFlush(); // presumably clears this helper's mark in ThreadFlushTracker
+ if (ThreadFlushTracker == 0)
+ should_signal_workers = true; // this helper was the last one a flush was waiting on
+ }
+ flush_lock.unlock(); // invokeCallbacks() must run without any lock held
+
+ invokeCallbacks();
+
+ if (should_signal_workers)
+ ThreadFlushCv.notify_all(); // wakes the waiter in waitForFlushCompletion()
+
+ // There is a scenario where a buffer was processed but not full
+ // or owned, so it was put back in waiting state. So this thread
+ // would not wait but keep on looping without having any actual
+ // work until new trace records are added and this thread
+ // signaled. Hence, this thread yields.
+ std::this_thread::yield();
+ }
+ bool is_last_helper = false;
+ std::unique_lock<std::mutex> flush_lock(FlushMutex);
+ assert(DoneTracing && "Helper thread exiting but not yet done");
+ assert(isThisThreadShutdownWaitedUpon() &&
+ "Helper thread exiting but not waited upon");
+ resetThisThreadShutdown();
+ if (ThreadShutdownTracker == 0)
+ is_last_helper = true;
+ flush_lock.unlock();
+ if (is_last_helper)
+ ThreadShutdownCv.notify_all(); // wakes the waiter in shutdownHelperThreads()
+
+ // Note that some trace records may have been written but not
+ // delivered to the tool. If flush/stop APIs are not called by the
+ // tool, those trace records may never be delivered to the tool and
+ // the corresponding buffers not reclaimed. TODO Explore whether
+ // this cleanup must be done.
+}
+
+/*
+ * Called by a buffer-completion helper thread. This function examines
+ * the flushed buffers in flush order and dispatches
+ * callbacks. Lock holding is minimized by reserving a buffer,
+ * processing it, and then unreserving it if there are more trace
+ * records to flush later. If all trace records are flushed, a
+ * callback is dispatched informing the tool that the buffer can be
+ * deallocated. If the buffer can be deallocated, all metadata is
+ * destroyed.
+ * Must be called without holding any locks: the helpers it calls take FlushMutex and BufferMgrMutex themselves.
+ */
+void OmptTracingBufferMgr::invokeCallbacks() {
+ ODBG(ODT_Tool) << "Looking for callbacks to invoke";
+ auto max_id = std::numeric_limits<uint64_t>::max(); // sentinel: ask for the first waiting buffer
+ auto curr_id = max_id;
+ auto end_id = get_flush_id(); // upper bound for this pass (presumably the next unassigned flush id)
+ ODBG(ODT_Tool) << "End id is " << end_id;
+ while (true) {
+ // Set the status of the flushed buffer to in-processing so that
+ // another helper thread does not process it concurrently. An
+ // OpenMP worker thread may, however, populate a trace record in a
+ // reserved buffer concurrently.
+ FlushInfo flush_info = findAndReserveFlushedBuf(curr_id);
+
+ // no entry found on the very first lookup, nothing to process
+ if (curr_id == max_id && flush_info.FlushCursor == nullptr)
+ return;
+
+ if (flush_info.FlushCursor != nullptr) {
+ // increment curr_id to get the candidate for the next iteration
+ curr_id = flush_info.FlushId + 1;
+ } else {
+ assert(curr_id != max_id && "Cannot increment max id");
+ ++curr_id;
+ }
+
+ ODBG(ODT_Tool) << "Next id will be " << curr_id;
+
+ if (flush_info.FlushCursor == nullptr) {
+ // This id is absent or reserved by another helper; move on
+ if (curr_id < end_id)
+ continue;
+ else
+ return; // nothing else to process
+ }
+
+ ODBG(ODT_Tool) << "Buf " << flush_info.FlushBuf->Start
+ << " Cursor " << flush_info.FlushCursor
+ << " Id " << flush_info.FlushId
+ << " will be flushed";
+
+ // Examine the status of the trace records and dispatch
+ // buffer-completion callbacks as appropriate.
+ flushBuffer(flush_info);
+
+ // TODO optimize to set buffer-owned in the same pass above.
+ // Currently, this is the only way a buffer is deallocated
+ if (isBufferFull(flush_info)) {
+ // The buffer accepts no more records; check whether all were delivered
+ if (isBufferOwned(flush_info)) {
+ // erase element from buffer and flush maps
+ destroyFlushedBuf(flush_info);
+
+ // dispatch callback with a null range and have the tool
+ // deallocate the buffer
+ dispatchBufferOwnedCallback(flush_info);
+ } else {
+ unreserveFlushedBuf(flush_info);
+ }
+ } else {
+ unreserveFlushedBuf(flush_info);
+ }
+ if (curr_id >= end_id)
+ return;
+ }
+}
+
+/*
+ * Walk the trace records of a buffer that this thread has already
+ * reserved, from the buffer start up to and including the flush
+ * cursor. Every maximal run of consecutive TR_ready records is marked
+ * TR_released and handed to the tool through one buffer-completion
+ * callback. Records in TR_init or TR_released state terminate a run.
+ * This routine must be called without holding locks.
+ */
+void OmptTracingBufferMgr::flushBuffer(FlushInfo flush_info) {
+  assert(flush_info.FlushBuf && "Cannot flush an empty buffer");
+  assert(flush_info.FlushCursor && "Cannot flush upto a null cursor");
+
+  void *Rec = flush_info.FlushBuf->Start;
+  void *const LastRec = flush_info.FlushCursor;
+  // Inclusive bounds of the run currently being collected; both are
+  // null while no run is open.
+  void *RunBegin = nullptr;
+  void *RunEnd = nullptr;
+
+  for (; Rec <= LastRec; Rec = getNextTR(Rec)) {
+    const TRStatus Status = getTRStatus(Rec);
+    const bool EndsRun = (Status == TR_init || Status == TR_released);
+
+    if (!EndsRun) {
+      assert(Status == TR_ready && "Unknown trace record status");
+      setTRStatus(Rec, TR_released);
+      if (RunBegin == nullptr)
+        RunBegin = Rec;
+      RunEnd = Rec;
+      continue;
+    }
+
+    if (RunBegin == nullptr) {
+      // No run is open, so this record is simply skipped.
+      assert(RunEnd == nullptr &&
+             "Begin/last cursors mutually inconsistent");
+      continue;
+    }
+
+    // Close the open run and report it.
+    dispatchCallback(flush_info.FlushBuf->DeviceId,
+                     flush_info.FlushBuf->Start, RunBegin, RunEnd);
+    RunBegin = RunEnd = nullptr;
+  }
+
+  // Report a run that extends to the flush cursor.
+  if (RunBegin != nullptr) {
+    assert(RunEnd != nullptr);
+    dispatchCallback(flush_info.FlushBuf->DeviceId, flush_info.FlushBuf->Start,
+                     RunBegin, RunEnd);
+  }
+}
+
+// Hand the inclusive trace-record range [FirstCursor, LastCursor] of
+// Buffer to the tool via a buffer-completion callback (buffer not owned).
+void OmptTracingBufferMgr::dispatchCallback(int64_t DeviceId, void *Buffer,
+                                            void *FirstCursor,
+                                            void *LastCursor) {
+  assert(FirstCursor != nullptr && LastCursor != nullptr &&
+         "Callback with nullptr");
+  addLastCursor(LastCursor);
+
+  // Best effort only: the callback may still fire in the short window
+  // after tracing has been disabled. No lock is held while the tool's
+  // callback runs.
+  const bool DeviceIsTraced = llvm::omp::target::ompt::isTracedDevice(DeviceId);
+  if (DeviceIsTraced) {
+    ODBG(ODT_Tool) << "Dispatch callback w/ range (inclusive) to be flushed: "
+                   << FirstCursor << " -> " << LastCursor;
+    // Bytes returned in this callback: one record past LastCursor minus
+    // the start of the range.
+    char *const RangeEnd = (char *)getNextTR(LastCursor);
+    char *const RangeBegin = (char *)FirstCursor;
+    llvm::omp::target::ompt::ompt_callback_buffer_complete(
+        DeviceId, Buffer, RangeEnd - RangeBegin,
+        (ompt_buffer_cursor_t)FirstCursor, false /* buffer_owned */);
+  }
+
+  removeLastCursor(LastCursor);
+}
+
+// Tell the tool it now owns the buffer: a buffer-completion callback is
+// issued with zero bytes, a zero cursor, and buffer_owned set, so the
+// tool can deallocate the buffer.
+void OmptTracingBufferMgr::dispatchBufferOwnedCallback(
+    const FlushInfo &flush_info) {
+  // Best effort only: the callback may still fire in the short window
+  // after tracing has been disabled. No lock is held while the tool's
+  // callback runs.
+  if (!llvm::omp::target::ompt::isTracedDevice(flush_info.FlushBuf->DeviceId))
+    return;
+
+  ODBG(ODT_Tool) << "Dispatch callback with buffer "
+                 << flush_info.FlushBuf->Start << " owned";
+  llvm::omp::target::ompt::ompt_callback_buffer_complete(
+      flush_info.FlushBuf->DeviceId, flush_info.FlushBuf->Start, 0,
+      (ompt_buffer_cursor_t)0, true /* buffer owned */);
+}
+
+void OmptTracingBufferMgr::initTraceRecordMetaData(void *Rec) { // Rec must point at a TraceRecord (see setTRStatus)
+ setTRStatus(Rec, TR_init); // flushBuffer() skips records that are still in TR_init
+}
+
+// Return the buffer currently registered for DeviceId, or nullptr (after
+// reporting) when DeviceId is outside [0, MAX_NUM_DEVICES - 1].
+OmptTracingBufferMgr::BufPtr
+OmptTracingBufferMgr::getDeviceSpecificBuffer(int64_t DeviceId) {
+  const bool InRange = !(DeviceId < 0) && !(DeviceId > MAX_NUM_DEVICES - 1);
+  if (InRange)
+    return ArrayOfBufPtr[DeviceId];
+
+  REPORT() << "getDeviceSpecificBuffer: Device id " << DeviceId
+           << " invalid or exceeds supported max: "
+           << MAX_NUM_DEVICES - 1;
+  return nullptr;
+}
+
+// Register Buf as the current buffer of DeviceId. An id outside
+// [0, MAX_NUM_DEVICES - 1] is reported and otherwise ignored.
+void OmptTracingBufferMgr::setDeviceSpecificBuffer(int64_t DeviceId,
+                                                   BufPtr Buf) {
+  const bool InRange = !(DeviceId < 0) && !(DeviceId > MAX_NUM_DEVICES - 1);
+  if (InRange) {
+    ArrayOfBufPtr[DeviceId] = Buf;
+    return;
+  }
+
+  REPORT() << "setDeviceSpecificBuffer: Device id " << DeviceId
+           << " invalid or exceeds supported max: "
+           << MAX_NUM_DEVICES - 1;
+}
+
+// Publish a new status for the trace record at Rec (release store).
+void OmptTracingBufferMgr::setTRStatus(void *Rec, TRStatus Status) {
+  static_cast<TraceRecord *>(Rec)->TRState.store(Status,
+                                                 std::memory_order_release);
+}
+
+// Read the current status of the trace record at Rec (acquire load).
+OmptTracingBufferMgr::TRStatus OmptTracingBufferMgr::getTRStatus(void *Rec) {
+  TraceRecord *TR = static_cast<TraceRecord *>(Rec);
+  return TR->TRState.load(std::memory_order_acquire);
+}
+
+// Address of the record slot following TR, i.e. TR advanced by one
+// record size. Warning: no overflow or end-of-buffer check is done.
+void *OmptTracingBufferMgr::getNextTR(void *TR) {
+  char *const Base = (char *)TR;
+  return Base + getTRSize();
+}
+
+// Read the buffer's IsFull flag while holding BufferMgrMutex.
+bool OmptTracingBufferMgr::isBufferFull(const FlushInfo &flush_info) {
+  std::lock_guard<std::mutex> Guard(BufferMgrMutex);
+  return flush_info.FlushBuf->IsFull;
+}
+
+void *OmptTracingBufferMgr::getBufferCursor(BufPtr buf) { // buf must be non-null; it is dereferenced unconditionally
+ return buf->Cursor.load(std::memory_order_acquire); // acquire load of the buffer's atomic cursor
+}
+
+/*
+ * Return true only when every trace record of a full buffer, from its
+ * start up to and including the buffer's current cursor, is in
+ * TR_released state; return false at the first record that is not.
+ */
+bool OmptTracingBufferMgr::isBufferOwned(const FlushInfo &flush_info) {
+  assert(isBufferFull(flush_info) && "Compute buffer-owned when it is full");
+  void *Rec = flush_info.FlushBuf->Start;
+  // The buffer is full, so its cursor is the last valid TR. It may be
+  // newer than the cursor cached in flush_info; using the live value
+  // avoids dropping trace records.
+  void *const LastRec = getBufferCursor(flush_info.FlushBuf);
+  for (; Rec <= LastRec; Rec = getNextTR(Rec)) {
+    if (getTRStatus(Rec) != TR_released)
+      return false;
+  }
+  return true;
+}
+
+/*
+ * Reserve a flushed buffer for processing by the calling thread.
+ * Reservation means switching the flush status from waiting to
+ * in-processing while holding FlushMutex.
+ *
+ * If FlushId is the maximum uint64_t value, the first entry of the
+ * flush metadata map that is in waiting state is reserved. Otherwise
+ * the entry with exactly that id is reserved, provided it exists and
+ * is not already in processing. On success the returned FlushInfo
+ * carries the id, the refreshed cursor, and the buffer; otherwise a
+ * default-constructed FlushInfo is returned.
+ */
+OmptTracingBufferMgr::FlushInfo
+OmptTracingBufferMgr::findAndReserveFlushedBuf(uint64_t FlushId) {
+  std::unique_lock<std::mutex> flush_lock(FlushMutex);
+
+  MapId2Md::iterator Candidate = Id2FlushMdMap.end();
+  if (FlushId == std::numeric_limits<uint64_t>::max()) {
+    // Scan in map order for the first buffer that is still waiting.
+    for (MapId2Md::iterator It = Id2FlushMdMap.begin();
+         It != Id2FlushMdMap.end(); ++It) {
+      if (It->second.FlushStatus == Flush_waiting) {
+        Candidate = It;
+        break;
+      }
+    }
+  } else {
+    MapId2Md::iterator It = Id2FlushMdMap.find(FlushId);
+    if (It != Id2FlushMdMap.end() &&
+        It->second.FlushStatus != Flush_processing)
+      Candidate = It;
+  }
+
+  if (Candidate == Id2FlushMdMap.end())
+    return FlushInfo();
+
+  auto &Md = Candidate->second;
+  assert(Md.FlushStatus == Flush_waiting);
+  Md.FlushStatus = Flush_processing;
+  // Refresh the cached cursor; more trace records may have been
+  // generated since the entry was created.
+  Md.FlushCursor = Md.FlushBuf->Cursor.load(std::memory_order_acquire);
+
+  FlushInfo flush_info(Candidate->first, Md.FlushCursor, Md.FlushBuf);
+  ODBG(ODT_Tool) << "Reserved buffer: flush_id:" << Candidate->first
+                 << ", cursor:" << Md.FlushCursor
+                 << ", buf:" << Md.FlushBuf->Start;
+  return flush_info;
+}
+
+/*
+ * Drop the reservation on a buffer: its flush entry must currently be
+ * in processing state and is put back into waiting state. Nothing
+ * checks that the caller is the thread that made the reservation.
+ */
+void OmptTracingBufferMgr::unreserveFlushedBuf(const FlushInfo &flush_info) {
+  std::unique_lock<std::mutex> flush_lock(FlushMutex);
+  auto Entry = Id2FlushMdMap.find(flush_info.FlushId);
+  assert(Entry != Id2FlushMdMap.end() &&
+         Entry->second.FlushStatus == Flush_processing);
+  auto &Md = Entry->second;
+  Md.FlushStatus = Flush_waiting;
+  ODBG(ODT_Tool) << "Unreserved buffer: flush_id:" << flush_info.FlushId
+                 << ", cursor:" << flush_info.FlushCursor
+                 << ", buf:" << flush_info.FlushBuf->Start;
+}
+
+/*
+ * Remove a buffer from every metadata map: the id-to-buffer map, the
+ * flush metadata map, and the buffer-to-flush-id map. Call this only
+ * after all trace records of the buffer have been released to the
+ * tool.
+ * Lock order: BufferMgrMutex first, then FlushMutex.
+ */
+void OmptTracingBufferMgr::destroyFlushedBuf(const FlushInfo &flush_info) {
+  ODBG(ODT_Tool) << "Destroying buffer: flush_id:" << flush_info.FlushId
+                 << ", cursor:" << flush_info.FlushCursor
+                 << ", buf:" << flush_info.FlushBuf->Start;
+
+  BufPtr Doomed = flush_info.FlushBuf;
+
+  std::unique_lock<std::mutex> buf_lock(BufferMgrMutex);
+  Id2BufferMap.erase(Doomed->Id);
+
+  std::unique_lock<std::mutex> flush_lock(FlushMutex);
+  auto Entry = Id2FlushMdMap.find(flush_info.FlushId);
+  assert(Entry != Id2FlushMdMap.end());
+  assert(Entry->second.FlushBuf == Doomed);
+  Id2FlushMdMap.erase(Entry);
+  FlushBufPtr2IdMap.erase(Doomed);
+}
+
+/*
+ * Allocate a fresh flush id for Buf and record it in both flush maps
+ * with status waiting and the given cursor. Returns the new id.
+ * The caller must already hold the flush lock.
+ */
+uint64_t OmptTracingBufferMgr::addNewFlushEntry(BufPtr Buf, void *Cursor) {
+  assert(FlushBufPtr2IdMap.find(Buf) == FlushBufPtr2IdMap.end());
+  const uint64_t NewId = get_and_inc_flush_id();
+  FlushBufPtr2IdMap.emplace(Buf, NewId);
+
+  assert(Id2FlushMdMap.find(NewId) == Id2FlushMdMap.end());
+  Id2FlushMdMap.emplace(NewId, FlushMd(Cursor, Buf, Flush_waiting));
+
+  ODBG(ODT_Tool) << "Added new flush id "
+                 << NewId << " cursor "
+                 << Cursor << " buf " << Buf->Start;
+
+  return NewId;
+}
+
+/*
+ * Called by ompt_flush_trace and ompt_stop_trace. Traverse the
+ * existing buffers in id order and queue each unflushed one; returns 1 on success, 0 on failure
+ */
+int OmptTracingBufferMgr::flushAllBuffers(int DeviceId) {
+ ODBG(ODT_Tool) << "Flushing buffers for device " << DeviceId << " :: START";
+ // Overloading MAX_NUM_DEVICES to mean all devices.
+ if (DeviceId < 0 || DeviceId > MAX_NUM_DEVICES)
+ return 0; // failed to flush
+
+ if (!areHelperThreadsAvailable())
+ return 0; // failed to flush
+
+ // If flush is called from a helper thread, just bail out
+ if (amIHelperThread())
+ return 0; // failed to flush
+
+ // To avoid holding the mutex for too long, get the ids of the first
+ // and the last buffers under lock, and then go through that range,
+ // holding the mutex for an individual buffer
+ std::unique_lock<std::mutex> buf_lock(BufferMgrMutex);
+ if (Id2BufferMap.empty())
+ return 1; // no trace records to flush
+ uint64_t curr_buf_id = Id2BufferMap.begin()->first;
+ uint64_t last_buf_id = Id2BufferMap.rbegin()->first;
+ buf_lock.unlock();
+
+ while (curr_buf_id <= last_buf_id) {
+ std::unique_lock<std::mutex> buf_lock(BufferMgrMutex); // shadows the outer buf_lock, which is already unlocked
+ // Another thread may have deleted this buffer by now
+ auto buf_itr = Id2BufferMap.find(curr_buf_id);
+ if (buf_itr == Id2BufferMap.end()) {
+ ++curr_buf_id;
+ continue;
+ }
+ BufPtr curr_buf = buf_itr->second;
+
+ // If the device-id does not match, skip it. A device-id of MAX_NUM_DEVICES
+ // indicates flushing for all devices.
+ if (DeviceId != MAX_NUM_DEVICES && curr_buf->DeviceId != DeviceId) {
+ ++curr_buf_id;
+ continue;
+ }
+
+ // If this buffer is in the flush-map, skip it. It is either in
+ // process by another thread or will be processed
+ std::unique_lock<std::mutex> flush_lock(FlushMutex); // lock order: BufferMgrMutex -> FlushMutex
+ auto flush_itr = FlushBufPtr2IdMap.find(curr_buf);
+ if (flush_itr != FlushBufPtr2IdMap.end()) {
+ ++curr_buf_id;
+ continue;
+ }
+ // This buffer has not been flushed yet
+ void *CurrBufCursor = getBufferCursor(curr_buf);
+ uint64_t flush_id = addNewFlushEntry(curr_buf, CurrBufCursor);
+ (void)flush_id; // Silence warning.
+ ODBG(ODT_Tool) << "flushAllBuffers: Added new id "
+ << flush_id << " cursor " << CurrBufCursor
+ << " buf " << curr_buf->Start;
+
+ flush_lock.unlock();
+ buf_lock.unlock();
+
+ ++curr_buf_id;
+ }
+
+ ODBG(ODT_Tool) << "Flushing buffers for device " << DeviceId << " :: WAIT";
+
+ // This is best effort. It is possible that some trace records are
+ // not flushed when the wait is done.
+ waitForFlushCompletion();
+
+ ODBG(ODT_Tool) << "Flushing buffers for device " << DeviceId << " :: STOP";
+
+ return 1; // success
+}
+
+// Ask every helper thread to flush and block until ThreadFlushTracker
+// drops back to zero. FlushMutex is not held while the helpers are
+// notified.
+void OmptTracingBufferMgr::waitForFlushCompletion() {
+  std::unique_lock<std::mutex> flush_lock(FlushMutex);
+  // Setting the flush bit for a helper thread tells it that the worker
+  // thread is ready for it to do some work.
+  for (uint32_t HelperIdx = 0; HelperIdx < OMPT_NUM_HELPER_THREADS;
+       ++HelperIdx)
+    setThreadFlush(HelperIdx);
+  flush_lock.unlock();
+
+  // Wake up all helper threads to invoke buffer-completion callbacks.
+  FlushCv.notify_all();
+
+  // Now wait for all helper threads to complete flushing.
+  flush_lock.lock();
+  ThreadFlushCv.wait(flush_lock, [this] { return ThreadFlushTracker == 0; });
+}
+
+// Reset the manager's tracking state: clear the done flag and both
+// helper-thread trackers, and null out every per-device buffer slot.
+void OmptTracingBufferMgr::init() {
+  DoneTracing = false; // TODO make it a class member
+  ThreadShutdownTracker = 0;
+  ThreadFlushTracker = 0;
+  for (int DevIdx = 0; DevIdx < MAX_NUM_DEVICES; ++DevIdx) {
+    ArrayOfBufPtr[DevIdx] = nullptr;
+  }
+}
+
+// Start the helper threads unless they already exist. Both the check
+// and the creation happen under FlushMutex.
+void OmptTracingBufferMgr::startHelperThreads() {
+  // Helper threads are stopped while FlushMutex is held, so a non-empty
+  // id map means they are running. That covers repeated start-trace
+  // calls.
+  std::unique_lock<std::mutex> flush_lock(FlushMutex);
+  const bool HelpersExist = !HelperThreadIdMap.empty();
+  if (!HelpersExist) {
+    init();
+    createHelperThreads();
+    return;
+  }
+  assert(!DoneTracing && "Helper threads exist but tracing is done");
+}
+
+// True when helper threads have been started and no stop has been
+// requested. Evaluated under FlushMutex.
+bool OmptTracingBufferMgr::areHelperThreadsAvailable() {
+  std::unique_lock<std::mutex> flush_lock(FlushMutex);
+  // DoneTracing set: another thread called stop, so assume there are no
+  // threads. HelperThreadIdMap is deliberately not asserted on, since a
+  // shutdown by another thread may be in progress.
+  if (DoneTracing)
+    return false;
+  // Empty map: threads were never started.
+  return !HelperThreadIdMap.empty();
+}
+
+void OmptTracingBufferMgr::shutdownHelperThreads() { // no-op when helpers were never started or a stop was already requested
+ if (!areHelperThreadsAvailable())
+ return;
+
+ std::unique_lock<std::mutex> flush_lock(FlushMutex);
+ // If I am destroying the threads, then at least one thread must be present
+ assert(!CompletionThreads.empty());
+ assert(!HelperThreadIdMap.empty());
+ assert(ThreadShutdownTracker == 0);
+
+ // Set the done flag which helper threads will look at
+ DoneTracing = true;
+ // Mark every helper thread as expected to acknowledge the shutdown
+ for (uint32_t i = 0; i < OMPT_NUM_HELPER_THREADS; ++i)
+ setThreadShutdown(i);
+ // Signal indicating that DoneTracing is set
+ FlushCv.notify_all();
+ ThreadShutdownCv.wait(flush_lock,
+ [this] { return ThreadShutdownTracker == 0; }); // FlushMutex is released while waiting so helpers can acknowledge
+
+ // Now join and destroy all the helper threads (FlushMutex is held again here)
+ destroyHelperThreads();
+}
+
+// Final teardown of the helper threads, serialized against other trace
+// control operations by TraceControlMutex.
+//
+// When OMPX_FlushOnShutdown is set, all buffers of all devices are
+// flushed first. Otherwise no new flush is initiated and the call only
+// waits for outstanding flushes. The helper threads are stopped
+// afterwards in both cases.
+void OmptTracingBufferMgr::flushAndShutdownHelperThreads() {
+  std::unique_lock<std::mutex> Lock(llvm::omp::target::ompt::TraceControlMutex);
+  if (OMPX_FlushOnShutdown) {
+    // Flush buffers for all devices. flushAllBuffers() itself bails out
+    // when no helper threads are available.
+    flushAllBuffers(MAX_NUM_DEVICES);
+  } else if (areHelperThreadsAvailable()) {
+    // Don't initiate a flush, only wait for outstanding ones.
+    // waitForFlushCompletion() marks every helper thread as
+    // flush-pending and then blocks until the helpers clear those marks.
+    // Without running helper threads (tracing never started, or already
+    // stopped) nothing would clear them and this call would never
+    // return, hence the availability check.
+    waitForFlushCompletion();
+  }
+  shutdownHelperThreads();
+}
+
+// Spawn OMPT_NUM_HELPER_THREADS threads running driveCompletion() and
+// record each thread's id together with its helper index.
+void OmptTracingBufferMgr::createHelperThreads() {
+  for (uint32_t HelperIdx = 0; HelperIdx < OMPT_NUM_HELPER_THREADS;
+       ++HelperIdx) {
+    // Construct the thread directly inside the container.
+    CompletionThreads.emplace_back(&OmptTracingBufferMgr::driveCompletion,
+                                   this);
+    const std::thread::id HelperTid = CompletionThreads.back().get_id();
+    HelperThreadIdMap[HelperTid] = HelperIdx;
+  }
+}
+
+// Join every helper thread, then forget the thread objects and their
+// id-to-index mapping.
+void OmptTracingBufferMgr::destroyHelperThreads() {
+  for (auto &Helper : CompletionThreads) {
+    Helper.join();
+  }
+  CompletionThreads.clear();
+  HelperThreadIdMap.clear();
+}
+#endif
>From e5ed98066fa6d230bd03ab24f1481f2ace4c1eb3 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 2 Apr 2026 07:52:40 -0500
Subject: [PATCH 07/15] [Offload][OMPT] Add tracing orchestration and
libomptarget integration
Wire the OMPT device tracing subsystem into the libomptarget runtime,
completing the end-to-end tracing pipeline from trace record production
through buffer management to tool delivery.
New files:
- OmptTracing.cpp (libomptarget-side): Full tracing orchestration with
C entry points (libomptarget_ompt_set_trace_ompt, start_trace,
flush_trace, stop_trace, advance_buffer_cursor, get_record_type,
set_granted_teams, set_timestamp), per-device buffer callback
management, trace event type filtering, thread-local timestamp/teams
state, and trace record production helpers
Modified files:
- interface.cpp: Add InterfaceRAII for getTraceGenerators<>() alongside
existing getCallbacks<>() in targetData (enter/exit/update),
targetKernel, and target_kernel_replay
- device.cpp: Add TracerInterfaceRAII for submitData, retrieveData,
dataExchange with RTL->getProfiler(); add InterfaceRAII trace
generators for allocData, deleteData; add setAsyncInfoSynchronous
helper for forced-sync tracing; always use async path in dataExchange
- device.h: Add ForceSynchronousTargetRegions bool for tracing control
- omptarget.cpp: Add TracerInterfaceRAII for kernel submit
(ompt_callback_target_submit) around Device.launchKernel
- PluginManager.cpp: Create/destroy OmptTracingBufferMgr in init/deinit,
flush and shutdown helper threads in unregisterLib
- Callback.cpp: Add announceTargetRegion, ompt_set_frame_enter binding,
extra includes and pragma push/pop for DEBUG_PREFIX
- exports: Add libomptarget_ompt_* symbol exports (8 entry points)
- CMakeLists.txt: Add OmptTracing.cpp, link PluginOmpt when OMPT enabled
Made-with: Cursor
---
offload/include/device.h | 3 +
offload/libomptarget/CMakeLists.txt | 4 +
offload/libomptarget/OpenMP/OMPT/Callback.cpp | 16 +
.../libomptarget/OpenMP/OMPT/OmptTracing.cpp | 885 ++++++++++++++++++
offload/libomptarget/PluginManager.cpp | 19 +
offload/libomptarget/device.cpp | 69 +-
offload/libomptarget/exports | 8 +
offload/libomptarget/interface.cpp | 44 +-
offload/libomptarget/omptarget.cpp | 6 +-
9 files changed, 1025 insertions(+), 29 deletions(-)
create mode 100644 offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
diff --git a/offload/include/device.h b/offload/include/device.h
index af103c316c3cf..5a4aa552a790e 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -49,6 +49,9 @@ struct DeviceTy {
GenericPluginTy *RTL;
int32_t RTLDeviceID;
+ /// Flag to force synchronous execution (used by OMPT device tracing)
+ bool ForceSynchronousTargetRegions = false;
+
DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID);
// DeviceTy is not copyable
DeviceTy(const DeviceTy &D) = delete;
diff --git a/offload/libomptarget/CMakeLists.txt b/offload/libomptarget/CMakeLists.txt
index 33b32d21e3c69..db75d72802441 100644
--- a/offload/libomptarget/CMakeLists.txt
+++ b/offload/libomptarget/CMakeLists.txt
@@ -17,6 +17,7 @@ add_library(omptarget SHARED
OpenMP/Mapping.cpp
OpenMP/InteropAPI.cpp
OpenMP/OMPT/Callback.cpp
+ OpenMP/OMPT/OmptTracing.cpp
OpenMP/OMPT/OmptTracingBuffer.cpp
KernelLanguage/API.cpp
@@ -48,6 +49,9 @@ endforeach()
target_compile_options(omptarget PRIVATE ${offload_compile_flags})
target_link_options(omptarget PRIVATE ${offload_link_flags})
+if (OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
+ target_link_libraries(omptarget PRIVATE PluginOmpt)
+endif()
# libomptarget.so needs to be aware of where the plugins live as they
# are now separated in the build directory.
diff --git a/offload/libomptarget/OpenMP/OMPT/Callback.cpp b/offload/libomptarget/OpenMP/OMPT/Callback.cpp
index 1e03f1455d1b2..5aec281779387 100644
--- a/offload/libomptarget/OpenMP/OMPT/Callback.cpp
+++ b/offload/libomptarget/OpenMP/OMPT/Callback.cpp
@@ -12,9 +12,13 @@
#ifdef OMPT_SUPPORT
+#include <atomic>
+#include <cassert>
#include <cstdlib>
#include <cstring>
#include <memory>
+#include <mutex>
+#include <thread>
#include "Shared/Debug.h"
@@ -23,7 +27,9 @@
#include "OpenMP/OMPT/Interface.h"
#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#pragma push_macro("DEBUG_PREFIX")
#undef DEBUG_PREFIX
#define DEBUG_PREFIX "OMPT"
@@ -56,6 +62,7 @@ ompt_get_callback_t llvm::omp::target::ompt::lookupCallbackByCode = nullptr;
ompt_function_lookup_t llvm::omp::target::ompt::lookupCallbackByName = nullptr;
ompt_get_target_task_data_t ompt_get_target_task_data_fn = nullptr;
ompt_get_task_data_t ompt_get_task_data_fn = nullptr;
+ompt_set_frame_enter_t ompt_set_frame_enter_fn = nullptr;
/// Unique correlation id
static std::atomic<uint64_t> IdCounter(1);
@@ -441,6 +448,11 @@ void Interface::endTarget(int64_t DeviceId, void *Code) {
endTargetRegion();
}
+void Interface::announceTargetRegion(const char *RegionName) { // only emits an ODBG line; no tool callback is invoked here
+ ODBG(ODT_Tool) << "in Interface::target_region_" << RegionName
+ << " target_id=" << TargetData.value; // RegionName is appended verbatim to the message prefix
+}
+
void Interface::beginTargetDataOperation() {
ODBG(ODT_Tool) << "in ompt_target_region_begin (TargetRegionId = "
<< TargetData.value << ")";
@@ -506,6 +518,7 @@ int llvm::omp::target::ompt::initializeLibrary(ompt_function_lookup_t lookup,
bindOmptFunctionName(ompt_get_callback, lookupCallbackByCode);
bindOmptFunctionName(ompt_get_task_data, ompt_get_task_data_fn);
bindOmptFunctionName(ompt_get_target_task_data, ompt_get_target_task_data_fn);
+ bindOmptFunctionName(ompt_set_frame_enter, ompt_set_frame_enter_fn);
#undef bindOmptFunctionName
// Store pointer of 'ompt_libomp_target_fn_lookup' for use by libomptarget
@@ -516,6 +529,8 @@ int llvm::omp::target::ompt::initializeLibrary(ompt_function_lookup_t lookup,
assert(ompt_get_task_data_fn && "ompt_get_task_data_fn should be non-null");
assert(ompt_get_target_task_data_fn &&
"ompt_get_target_task_data_fn should be non-null");
+ assert(ompt_set_frame_enter_fn &&
+ "ompt_set_frame_enter_fn should be non-null");
assert(LibraryFinalizer == nullptr &&
"LibraryFinalizer should not be initialized yet");
@@ -562,4 +577,5 @@ void llvm::omp::target::ompt::connectLibrary() {
ODBG(ODT_Tool) << "Exiting connectLibrary";
}
+#pragma pop_macro("DEBUG_PREFIX")
#endif // OMPT_SUPPORT
diff --git a/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp b/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
new file mode 100644
index 0000000000000..78b81f377dab4
--- /dev/null
+++ b/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
@@ -0,0 +1,885 @@
+//===-- OmptTracing.cpp - Target independent OpenMP target RTL --- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OMPT tracing interfaces for target independent layer
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef OMPT_SUPPORT
+
+#include "OpenMP/OMPT/OmptTracing.h"
+#include "OpenMP/OMPT/Callback.h"
+#include "OpenMP/OMPT/Interface.h"
+#include "OpenMP/OMPT/OmptTracingBuffer.h"
+#include "PluginManager.h"
+#include "Shared/Debug.h"
+#include "omp-tools.h"
+
+#include "llvm/Support/DynamicLibrary.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#pragma push_macro("DEBUG_PREFIX")
+#undef DEBUG_PREFIX
+#define DEBUG_PREFIX "OMPT"
+
+using namespace llvm::omp::target::ompt;
+
+std::mutex llvm::omp::target::ompt::DeviceAccessMutex; // held by the functions in this file that read or update TracedDevices
+std::mutex llvm::omp::target::ompt::TraceAccessMutex;
+std::mutex llvm::omp::target::ompt::TraceControlMutex; // held across OmptTracingBufferMgr::flushAndShutdownHelperThreads()
+std::mutex llvm::omp::target::ompt::TraceHashThreadMutex;
+std::mutex llvm::omp::target::ompt::BufferManagementFnMutex; // held by the accessors of BufferManagementFns in this file
+
+std::unordered_map<int /*DeviceId*/, std::pair<ompt_callback_buffer_request_t,
+ ompt_callback_buffer_complete_t>>
+ llvm::omp::target::ompt::BufferManagementFns; // per device: (buffer-request fn, buffer-complete fn)
+
+thread_local uint32_t llvm::omp::target::ompt::TraceRecordNumGrantedTeams = 0;
+thread_local uint64_t llvm::omp::target::ompt::TraceRecordStartTime = 0;
+thread_local uint64_t llvm::omp::target::ompt::TraceRecordStopTime = 0;
+thread_local uint64_t llvm::omp::target::ompt::ThreadId =
+ std::numeric_limits<uint64_t>::max(); // presumably "not yet assigned" — confirm against the code that sets ThreadId
+
+std::map<int32_t, uint64_t> llvm::omp::target::ompt::TracedDevices; // device id -> bit mask: bit 0 = device traced, bit N = event type N enabled
+
+bool llvm::omp::target::ompt::TracingActive = false; // set by enableDeviceTracing(); cleared by disableDeviceTracing() once no device is traced
+
+void llvm::omp::target::ompt::resetTimestamp(uint64_t *T) { *T = 0; } // T must be non-null; it is dereferenced without a check
+
+// Look up the buffer-request callback registered for DeviceId; nullptr
+// when no callbacks are registered for that device.
+ompt_callback_buffer_request_t
+llvm::omp::target::ompt::getBufferRequestFn(int DeviceId) {
+  std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
+  auto Entry = BufferManagementFns.find(DeviceId);
+  const bool Registered = Entry != BufferManagementFns.end();
+  if (Registered)
+    return Entry->second.first;
+  return nullptr;
+}
+
+// Look up the buffer-complete callback registered for DeviceId; nullptr
+// when no callbacks are registered for that device.
+ompt_callback_buffer_complete_t
+llvm::omp::target::ompt::getBufferCompleteFn(int DeviceId) {
+  std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
+  auto Entry = BufferManagementFns.find(DeviceId);
+  const bool Registered = Entry != BufferManagementFns.end();
+  if (Registered)
+    return Entry->second.second;
+  return nullptr;
+}
+
+// Register the buffer-request and buffer-complete callbacks for
+// DeviceId. If callbacks are already registered for that device, the
+// existing pair is kept, a message is reported, and the new pair is
+// ignored.
+void llvm::omp::target::ompt::setBufferManagementFns(
+    int DeviceId, ompt_callback_buffer_request_t ReqFn,
+    ompt_callback_buffer_complete_t CmpltFn) {
+  std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
+  auto BufferMgrItr = BufferManagementFns.find(DeviceId);
+  if (BufferMgrItr != BufferManagementFns.end()) {
+    // Separate the device id from the trailing text; without the
+    // separator the message reads e.g. "device 3ignoring ...".
+    REPORT() << "Buffer request and complete functions already exist for device "
+             << DeviceId << ", ignoring ...";
+    return;
+  }
+  BufferManagementFns[DeviceId] = std::make_pair(ReqFn, CmpltFn);
+}
+
+// Unregister the buffer-management callbacks of DeviceId. If none are
+// registered, a message is reported and nothing changes.
+void llvm::omp::target::ompt::removeBufferManagementFns(int DeviceId) {
+  std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
+  auto BufferMgrItr = BufferManagementFns.find(DeviceId);
+  if (BufferMgrItr == BufferManagementFns.end()) {
+    // Separate the device id from the trailing text; without the
+    // separator the message reads e.g. "device 3ignoring ...".
+    REPORT() << "Buffer request and complete functions don't exist for device "
+             << DeviceId << ", ignoring ...";
+    return;
+  }
+  BufferManagementFns.erase(BufferMgrItr);
+}
+
+// True when no device has buffer-management callbacks registered.
+bool llvm::omp::target::ompt::isAllDeviceTracingStopped() {
+  std::lock_guard<std::mutex> Guard(BufferManagementFnMutex);
+  const bool NoneRegistered = BufferManagementFns.empty();
+  return NoneRegistered;
+}
+
+// Forward a buffer request to the callback registered for DeviceId.
+// Does nothing when no callback is registered.
+void llvm::omp::target::ompt::ompt_callback_buffer_request(
+    int DeviceId, ompt_buffer_t **BufferPtr, size_t *Bytes) {
+  ompt_callback_buffer_request_t RequestFn = getBufferRequestFn(DeviceId);
+  if (!RequestFn)
+    return;
+  RequestFn(DeviceId, BufferPtr, Bytes);
+}
+
+// Forward a buffer-completion event to the callback registered for
+// DeviceId. Does nothing when no callback is registered.
+void llvm::omp::target::ompt::ompt_callback_buffer_complete(
+    int DeviceId, ompt_buffer_t *Buffer, size_t Bytes,
+    ompt_buffer_cursor_t BeginCursor, int BufferOwned) {
+  ompt_callback_buffer_complete_t CompleteFn = getBufferCompleteFn(DeviceId);
+  if (!CompleteFn)
+    return;
+  CompleteFn(DeviceId, Buffer, Bytes, BeginCursor, BufferOwned);
+}
+
+// Bit 0 of the mask means "device tracing generally enabled"; turn it on.
+inline void setDeviceTracing(uint64_t &TracingTypes) {
+  TracingTypes = TracingTypes | uint64_t{1};
+}
+
+// Clear bit 0 of the mask, i.e. mark device tracing as generally
+// disabled, leaving every other bit untouched.
+inline void resetDeviceTracing(uint64_t &TracingTypes) {
+  // Build the inverted mask at the full 64-bit width. ~(1UL) is only
+  // 32 bits wide where unsigned long is 32-bit; zero-extended to
+  // uint64_t it would also clear bits 32..63.
+  TracingTypes &= ~uint64_t{1};
+}
+
+// Report whether bit 0 of the mask (device actively traced) is set.
+inline bool checkDeviceTracingState(const uint64_t &TracingTypes) {
+  return (TracingTypes & uint64_t{1}) != 0;
+}
+
+// Mark DeviceId as traced, creating its entry in TracedDevices if it
+// does not exist yet, and raise the global TracingActive flag.
+void llvm::omp::target::ompt::enableDeviceTracing(int DeviceId) {
+  std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
+  // operator[] inserts a zero mask for a device that is not yet known.
+  setDeviceTracing(TracedDevices[DeviceId]);
+  // In any case: at least one device is traced
+  TracingActive = true;
+}
+
+// Mark DeviceId as not traced (an entry is created if the device was
+// unknown). The global TracingActive flag is lowered only when no
+// device remains traced.
+void llvm::omp::target::ompt::disableDeviceTracing(int DeviceId) {
+  std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
+  // operator[] inserts a zero mask for a device that is not yet known.
+  resetDeviceTracing(TracedDevices[DeviceId]);
+
+  // Look for any device that is still actively traced.
+  bool AnyDeviceTraced = false;
+  for (const auto &Entry : TracedDevices) {
+    if (checkDeviceTracingState(Entry.second)) {
+      AnyDeviceTraced = true;
+      break;
+    }
+  }
+
+  // If no device is currently traced: set global tracing flag to false
+  if (!AnyDeviceTraced)
+    TracingActive = false;
+}
+
+// Tracing of EventTy on DeviceId is enabled when tracing is globally
+// active, the device is traced, and the event's type group is enabled
+// for that device. The checks run in that order and stop at the first
+// failure.
+bool llvm::omp::target::ompt::isTracingEnabled(int DeviceId,
+                                               unsigned int EventTy) {
+  if (!TracingActive)
+    return false;
+  if (!isTracedDevice(DeviceId))
+    return false;
+  return isTracingTypeGroupEnabled(DeviceId, EventTy);
+}
+
+// True when DeviceId has an entry in TracedDevices whose tracing bit
+// (bit 0) is set.
+bool llvm::omp::target::ompt::isTracedDevice(int DeviceId) {
+  std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
+  auto Entry = TracedDevices.find(DeviceId);
+  return Entry != TracedDevices.end() &&
+         checkDeviceTracingState(Entry->second);
+}
+
+// Report whether bit EventTy is set in the tracing mask of DeviceId.
+// Returns false for an unknown device or an EventTy of 64 or more.
+bool llvm::omp::target::ompt::isTracingTypeEnabled(int DeviceId,
+                                                   unsigned int EventTy) {
+  std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
+  // Make sure we do not shift more than std::numeric_limits<uint64_t>::digits
+  assert(EventTy < 64 && "Shift limit exceeded: EventTy must be less than 64");
+  if (EventTy >= 64)
+    return false;
+
+  auto Device = TracedDevices.find(DeviceId);
+  if (Device == TracedDevices.end())
+    return false;
+
+  // Shift a 64-bit one: 1UL is only 32 bits wide on some ABIs, where
+  // shifting it by 32..63 would be undefined even though the mask has
+  // 64 bits.
+  return (Device->second & (uint64_t{1} << EventTy)) != 0;
+}
+
+// Report whether the event group that EventTy belongs to is enabled
+// for DeviceId. A group is a callback together with its _emi variant
+// (target, target_data_op, target_submit); it counts as enabled when
+// either bit is set. EventTy == 0 asks whether any of the three groups
+// is enabled. Unknown devices, other event types, and EventTy of 64 or
+// more yield false.
+bool llvm::omp::target::ompt::isTracingTypeGroupEnabled(int DeviceId,
+                                                        unsigned int EventTy) {
+  std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
+  // Make sure we do not shift more than std::numeric_limits<uint64_t>::digits
+  assert(EventTy < 64 && "Shift limit exceeded: EventTy must be less than 64");
+  if (EventTy >= 64)
+    return false;
+
+  auto Device = TracedDevices.find(DeviceId);
+  if (Device == TracedDevices.end())
+    return false;
+
+  const uint64_t TracedEvents = Device->second;
+  // Test one bit of the mask. The one is shifted at 64-bit width; 1UL
+  // is only 32 bits wide on some ABIs.
+  auto IsSet = [TracedEvents](unsigned int Ty) {
+    return (TracedEvents & (uint64_t{1} << Ty)) != 0;
+  };
+  const bool TargetGroup =
+      IsSet(ompt_callback_target) || IsSet(ompt_callback_target_emi);
+  const bool DataOpGroup = IsSet(ompt_callback_target_data_op) ||
+                           IsSet(ompt_callback_target_data_op_emi);
+  const bool SubmitGroup = IsSet(ompt_callback_target_submit) ||
+                           IsSet(ompt_callback_target_submit_emi);
+
+  switch (EventTy) {
+  case ompt_callbacks_t::ompt_callback_target:
+  case ompt_callbacks_t::ompt_callback_target_emi:
+    return TargetGroup;
+  case ompt_callbacks_t::ompt_callback_target_data_op:
+  case ompt_callbacks_t::ompt_callback_target_data_op_emi:
+    return DataOpGroup;
+  case ompt_callbacks_t::ompt_callback_target_submit:
+  case ompt_callbacks_t::ompt_callback_target_submit_emi:
+    return SubmitGroup;
+  // Special case: EventTy == 0 -> Check all EventTy
+  case 0:
+    return TargetGroup || DataOpGroup || SubmitGroup;
+  default:
+    // Event types outside the three groups are never reported enabled.
+    return false;
+  }
+}
+
+/// Set (\p Enable == true) or clear (\p Enable == false) the bit for
+/// \p EventTy in the mask \p TracedEventTy. An \p EventTy that is not a valid
+/// bit index (>= 64) leaves the mask untouched.
+void llvm::omp::target::ompt::setTracingTypeEnabled(uint64_t &TracedEventTy,
+                                                    bool Enable,
+                                                    unsigned int EventTy) {
+  // Make sure we do not shift more than std::numeric_limits<uint64_t>::digits
+  assert(EventTy < 64 && "Shift limit exceeded: EventTy must be less than 64");
+  if (EventTy >= 64)
+    return;
+
+  // Build the mask in 64 bits. With a 32-bit `unsigned long`, `1UL << EventTy`
+  // is undefined for EventTy >= 32, and `~(1UL << EventTy)` zero-extends, so
+  // clearing one bit would also wipe bits 32..63 of the mask.
+  const uint64_t Mask = uint64_t{1} << EventTy;
+  if (Enable)
+    TracedEventTy |= Mask;
+  else
+    TracedEventTy &= ~Mask;
+}
+
+/// Enable (\p Enable > 0) or disable tracing of \p EventTy on \p DeviceId.
+///
+/// Return value, as implemented below:
+///  - negative \p DeviceId: ompt_set_never, nothing is changed.
+///  - \p EventTy == 0: all six target / data-op / submit types (EMI and
+///    non-EMI) are set or cleared; ompt_set_sometimes when enabling,
+///    ompt_set_always when disabling.
+///  - one of those six types: that bit is set or cleared; ompt_set_always.
+///  - any other type: ompt_set_never when enabling, ompt_set_always when
+///    disabling; the mask is not changed.
+ompt_set_result_t
+llvm::omp::target::ompt::setTraceEventTy(int DeviceId, unsigned int Enable,
+ unsigned int EventTy) {
+ if (DeviceId < 0) {
+ REPORT() << "Failed to set trace event type for DeviceId=" << DeviceId;
+ return ompt_set_never;
+ }
+
+ ODBG(ODT_Tool) << "Executing setTraceEventTy: DeviceId=" << DeviceId
+ << " Enable=" << Enable << " EventTy=" << EventTy;
+
+ std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
+ // Create an empty (all types disabled) entry on first use of this device
+ if (TracedDevices.find(DeviceId) == TracedDevices.end())
+ TracedDevices.emplace(DeviceId, 0UL);
+
+ auto &TracedEventTy = TracedDevices[DeviceId];
+ bool Enabled = Enable > 0;
+ if (EventTy == 0) {
+ // Set / reset all supported types
+ setTracingTypeEnabled(TracedEventTy, Enabled,
+ ompt_callbacks_t::ompt_callback_target);
+ setTracingTypeEnabled(TracedEventTy, Enabled,
+ ompt_callbacks_t::ompt_callback_target_data_op);
+ setTracingTypeEnabled(TracedEventTy, Enabled,
+ ompt_callbacks_t::ompt_callback_target_submit);
+ setTracingTypeEnabled(TracedEventTy, Enabled,
+ ompt_callbacks_t::ompt_callback_target_emi);
+ setTracingTypeEnabled(TracedEventTy, Enabled,
+ ompt_callbacks_t::ompt_callback_target_data_op_emi);
+ setTracingTypeEnabled(TracedEventTy, Enabled,
+ ompt_callbacks_t::ompt_callback_target_submit_emi);
+
+ if (Enabled) {
+ // Event subset is enabled
+ return ompt_set_sometimes;
+ } else {
+ // All events are disabled
+ return ompt_set_always;
+ }
+ }
+
+ switch (EventTy) {
+ case ompt_callbacks_t::ompt_callback_target:
+ case ompt_callbacks_t::ompt_callback_target_data_op:
+ case ompt_callbacks_t::ompt_callback_target_submit:
+ case ompt_callbacks_t::ompt_callback_target_emi:
+ case ompt_callbacks_t::ompt_callback_target_data_op_emi:
+ case ompt_callbacks_t::ompt_callback_target_submit_emi: {
+ setTracingTypeEnabled(TracedEventTy, Enabled, EventTy);
+ return ompt_set_always;
+ }
+ default: {
+ if (Enabled) {
+ // Unimplemented
+ return ompt_set_never;
+ } else {
+ // Never enabled, so disabling always succeeds
+ return ompt_set_always;
+ }
+ }
+ }
+}
+
+/// Return an id for the calling thread: the std::hash of
+/// std::this_thread::get_id(), computed on first use and cached in ThreadId.
+/// std::numeric_limits<uint64_t>::max() marks ThreadId as "not yet computed".
+uint64_t llvm::omp::target::ompt::getThreadId() {
+ // Grab the value from thread local storage, if valid.
+ if (ThreadId != std::numeric_limits<uint64_t>::max())
+ return ThreadId;
+ // Otherwise set it, protecting the hash with a lock.
+ std::unique_lock<std::mutex> Lock(TraceHashThreadMutex);
+ ThreadId = std::hash<std::thread::id>()(std::this_thread::get_id());
+ return ThreadId;
+}
+
+/// Fill the fields shared by all record kinds: type, time, thread_id and
+/// target_id. For ompt_callback_target records the time is set to 0; for all
+/// other types it is taken from TraceRecordStartTime, which is then handed to
+/// resetTimestamp().
+void Interface::setTraceRecordCommon(ompt_record_ompt_t *DataPtr,
+ ompt_callbacks_t CallbackType) {
+ DataPtr->type = CallbackType;
+
+ if (CallbackType == ompt_callback_target)
+ DataPtr->time = 0; // Currently, no consumer, so no need to set it
+ else {
+ DataPtr->time = TraceRecordStartTime;
+ resetTimestamp(&TraceRecordStartTime);
+ }
+
+ DataPtr->thread_id = getThreadId();
+ DataPtr->target_id = TargetData.value;
+}
+
+/// Populate a data-op record from the arguments and the current HostOpId.
+/// end_time is taken from TraceRecordStopTime, which is then handed to
+/// resetTimestamp().
+void Interface::setTraceRecordTargetDataOp(ompt_record_target_data_op_t *Record,
+ ompt_target_data_op_t DataOpType,
+ void *SrcAddr, int64_t SrcDeviceNum,
+ void *DstAddr, int64_t DstDeviceNum,
+ size_t Bytes, void *CodePtr) {
+ Record->host_op_id = HostOpId;
+ Record->optype = DataOpType;
+ Record->src_addr = SrcAddr;
+ Record->src_device_num = SrcDeviceNum;
+ Record->dest_addr = DstAddr;
+ Record->dest_device_num = DstDeviceNum;
+ Record->bytes = Bytes;
+
+ Record->end_time = TraceRecordStopTime;
+ resetTimestamp(&TraceRecordStopTime);
+
+ Record->codeptr_ra = CodePtr;
+}
+
+/// Populate a kernel record: host_op_id from HostOpId, the requested team
+/// count from \p NumTeams and the granted team count from
+/// TraceRecordNumGrantedTeams. end_time is taken from TraceRecordStopTime,
+/// which is then handed to resetTimestamp().
+void Interface::setTraceRecordTargetKernel(ompt_record_target_kernel_t *Record,
+ unsigned int NumTeams) {
+ Record->host_op_id = HostOpId;
+ Record->requested_num_teams = NumTeams;
+ Record->granted_num_teams = TraceRecordNumGrantedTeams;
+
+ Record->end_time = TraceRecordStopTime;
+ resetTimestamp(&TraceRecordStopTime);
+}
+
+/// Populate a target-region record from the arguments plus the current
+/// TaskData and TargetData values. TaskData must be non-null (asserted).
+void Interface::setTraceRecordTarget(ompt_record_target_t *Record,
+ int64_t DeviceId, ompt_target_t TargetKind,
+ ompt_scope_endpoint_t Endpoint,
+ void *CodePtr) {
+ Record->kind = TargetKind;
+ Record->endpoint = Endpoint;
+ Record->device_num = DeviceId;
+ assert(TaskData);
+ Record->task_id = TaskData->value;
+ Record->target_id = TargetData.value;
+ Record->codeptr_ra = CodePtr;
+}
+
+/// No-op: the body is empty. The data-alloc trace record is created in
+/// stopTargetDataAllocTrace().
+void Interface::startTargetDataAllocTrace(int64_t DeviceId, void *HstPtrBegin,
+ void **TgtPtrBegin, size_t Size,
+ void *Code) {}
+
+/// Create a data-op trace record of kind ompt_target_data_alloc for
+/// \p DeviceId and mark it TR_ready. The source is \p HstPtrBegin on
+/// omp_get_initial_device(); the destination is \p *TgtPtrBegin on
+/// \p DeviceId. Returns nullptr if data-op tracing is not enabled for the
+/// device or assignCursor() yields no record.
+ompt_record_ompt_t *Interface::stopTargetDataAllocTrace(int64_t DeviceId,
+ void *HstPtrBegin,
+ void **TgtPtrBegin,
+ size_t Size,
+ void *Code) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target_data_op))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr = (ompt_record_ompt_t *)TRM->assignCursor(
+ ompt_callback_target_data_op, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target_data_op);
+ setTraceRecordTargetDataOp(&DataPtr->record.target_data_op,
+ ompt_target_data_alloc, HstPtrBegin,
+ /*SrcDeviceNum=*/omp_get_initial_device(),
+ *TgtPtrBegin, DeviceId, Size, Code);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Generated trace record: " << DataPtr
+ << " (ompt_target_data_alloc)";
+ return DataPtr;
+}
+
+/// No-op: the body is empty. The data-delete trace record is created in
+/// stopTargetDataDeleteTrace().
+void Interface::startTargetDataDeleteTrace(int64_t DeviceId, void *TgtPtrBegin,
+ void *Code) {}
+
+/// Create a data-op trace record of kind ompt_target_data_delete for
+/// \p DeviceId and mark it TR_ready. The source is \p TgtPtrBegin on
+/// \p DeviceId; the destination is recorded as nullptr on device -1 with a
+/// size of 0 bytes. Returns nullptr if data-op tracing is not enabled for the
+/// device or assignCursor() yields no record.
+ompt_record_ompt_t *Interface::stopTargetDataDeleteTrace(int64_t DeviceId,
+ void *TgtPtrBegin,
+ void *Code) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target_data_op))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr = (ompt_record_ompt_t *)TRM->assignCursor(
+ ompt_callback_target_data_op, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target_data_op);
+ setTraceRecordTargetDataOp(&DataPtr->record.target_data_op,
+ ompt_target_data_delete, TgtPtrBegin, DeviceId,
+ /*DstAddr=*/nullptr,
+ /*DstDeviceNum=*/-1, /*Bytes=*/0, Code);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Generated trace record: " << DataPtr
+ << " (ompt_target_data_delete)";
+ return DataPtr;
+}
+
+/// Begin a data-op trace record of kind ompt_target_data_transfer_to_device.
+/// Tracing is checked against, and the record is assigned for, \p DstDeviceId.
+/// The record is NOT marked TR_ready here and its time is left at 0; the
+/// returned pointer is meant to be completed later by a "stop" function.
+/// Returns nullptr if data-op tracing is not enabled for the device or
+/// assignCursor() yields no record.
+ompt_record_ompt_t *
+Interface::startTargetDataSubmitTrace(int64_t SrcDeviceId, void *SrcPtrBegin,
+ int64_t DstDeviceId, void *DstPtrBegin,
+ size_t Size, void *Code) {
+ if (!isTracingEnabled(DstDeviceId, ompt_callback_target_data_op))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr = (ompt_record_ompt_t *)TRM->assignCursor(
+ ompt_callback_target_data_op, DstDeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target_data_op);
+ DataPtr->time = 0; // Set to sanity value and let "stop" function fix it
+
+ // Set some of the data-op specific fields here
+ setTraceRecordTargetDataOp(&DataPtr->record.target_data_op,
+ ompt_target_data_transfer_to_device, SrcPtrBegin,
+ SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
+
+ ODBG(ODT_Tool) << "OMPT-Async: Returning data trace record buf ptr " << DataPtr;
+ return DataPtr;
+}
+
+/// Begin a data-op trace record of kind ompt_target_data_transfer_from_device.
+/// Tracing is checked against, and the record is assigned for, \p SrcDeviceId.
+/// The record is NOT marked TR_ready here and its time is left at 0; the
+/// returned pointer is meant to be completed later by a "stop" function.
+/// Returns nullptr if data-op tracing is not enabled for the device or
+/// assignCursor() yields no record.
+ompt_record_ompt_t *
+Interface::startTargetDataRetrieveTrace(int64_t SrcDeviceId, void *SrcPtrBegin,
+ int64_t DstDeviceId, void *DstPtrBegin,
+ size_t Size, void *Code) {
+ if (!isTracingEnabled(SrcDeviceId, ompt_callback_target_data_op))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr = (ompt_record_ompt_t *)TRM->assignCursor(
+ ompt_callback_target_data_op, SrcDeviceId);
+
+ // This event will not be traced
+ if (!DataPtr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target_data_op);
+ DataPtr->time = 0; // Set to sanity value and let "stop" function fix it
+
+ // Set some of the data-op specific fields here
+ setTraceRecordTargetDataOp(&DataPtr->record.target_data_op,
+ ompt_target_data_transfer_from_device, SrcPtrBegin,
+ SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
+
+ ODBG(ODT_Tool) << "OMPT-Async: Returning data trace record buf ptr " << DataPtr;
+ return DataPtr;
+}
+
+/// Complete a data-op record: store \p NanosStart as its time and \p NanosEnd
+/// as its end_time, then mark it TR_ready. \p DataPtr is dereferenced
+/// unconditionally, so it must be non-null. Returns \p DataPtr.
+ompt_record_ompt_t *Interface::stopTargetDataMovementTraceAsync(
+ ompt_record_ompt_t *DataPtr, uint64_t NanosStart, uint64_t NanosEnd) {
+ // Finalize the data that comes from the plugin.
+ DataPtr->time = NanosStart;
+ auto Record = static_cast<ompt_record_target_data_op_t *>(
+ &DataPtr->record.target_data_op);
+ Record->end_time = NanosEnd;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "OMPT-Async: Completed target_data trace record " << DataPtr;
+ return DataPtr;
+}
+
+/// Begin a kernel-submit trace record for \p DeviceId. The common fields, the
+/// requested team count (\p NumTeams) and the host op id are filled in here;
+/// the record is NOT marked TR_ready and its time is left at 0 so that a
+/// "stop" function can complete it. Returns nullptr if submit tracing is not
+/// enabled for the device or assignCursor() yields no record.
+ompt_record_ompt_t *Interface::startTargetSubmitTrace(int64_t DeviceId,
+                                                      unsigned int NumTeams) {
+  if (!isTracingEnabled(DeviceId, ompt_callback_target_submit))
+    return nullptr;
+
+  assert(PM && "Plugin manager not initialized");
+  OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+  ompt_record_ompt_t *DataPtr = (ompt_record_ompt_t *)TRM->assignCursor(
+      ompt_callback_target_submit, DeviceId);
+
+  // This event will not be traced. Bail out before touching the record: the
+  // setters below dereference DataPtr.
+  if (DataPtr == nullptr)
+    return nullptr;
+
+  // Set all known entries and leave remaining to the stop function
+  setTraceRecordCommon(DataPtr, ompt_callback_target_submit);
+  DataPtr->time = 0; // Set to sanity value and let "stop" function fix it
+  // Kernel specific things
+  DataPtr->record.target_kernel.requested_num_teams = NumTeams;
+  DataPtr->record.target_kernel.host_op_id = getHostOpId();
+
+  ODBG(ODT_Tool) << "OMPT-Async: Returning kernel trace record buf ptr " << DataPtr;
+  return DataPtr;
+}
+
+/// Complete a kernel-submit record: store \p NanosStart as its time,
+/// \p NanosStop as its end_time and \p NumTeams as the granted team count,
+/// then mark it TR_ready. \p DataPtr is dereferenced unconditionally, so it
+/// must be non-null. Returns \p DataPtr.
+ompt_record_ompt_t *
+Interface::stopTargetSubmitTraceAsync(ompt_record_ompt_t *DataPtr,
+ unsigned int NumTeams,
+ uint64_t NanosStart, uint64_t NanosStop) {
+ // Common fields
+ DataPtr->time = NanosStart;
+ // Submit specific
+ DataPtr->record.target_kernel.end_time = NanosStop;
+ DataPtr->record.target_kernel.granted_num_teams = NumTeams;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ // Ready Record
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "OMPT-Async: Completed trace record buf ptr " << DataPtr;
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target_enter_data with endpoint
+/// ompt_scope_begin for \p DeviceId and mark it TR_ready. Returns nullptr if
+/// target tracing is not enabled for the device or assignCursor() yields no
+/// record.
+ompt_record_ompt_t *Interface::startTargetDataEnterTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId,
+ ompt_target_enter_data, ompt_scope_begin, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ << " (ompt_target_enter_data)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target_enter_data with endpoint
+/// ompt_scope_end for \p DeviceId and mark it TR_ready. Returns nullptr if
+/// target tracing is not enabled for the device or assignCursor() yields no
+/// record.
+ompt_record_ompt_t *Interface::stopTargetDataEnterTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId,
+ ompt_target_enter_data, ompt_scope_end, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ << " (ompt_target_enter_data)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target_exit_data with endpoint
+/// ompt_scope_begin for \p DeviceId and mark it TR_ready. Returns nullptr if
+/// target tracing is not enabled for the device or assignCursor() yields no
+/// record.
+ompt_record_ompt_t *Interface::startTargetDataExitTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId, ompt_target_exit_data,
+ ompt_scope_begin, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ << " (ompt_target_exit_data)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target_exit_data with endpoint
+/// ompt_scope_end for \p DeviceId and mark it TR_ready. Returns nullptr if
+/// target tracing is not enabled for the device or assignCursor() yields no
+/// record.
+ompt_record_ompt_t *Interface::stopTargetDataExitTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId, ompt_target_exit_data,
+ ompt_scope_end, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ << " (ompt_target_exit_data)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target_update with endpoint
+/// ompt_scope_begin for \p DeviceId and mark it TR_ready. Returns nullptr if
+/// target tracing is not enabled for the device or assignCursor() yields no
+/// record.
+ompt_record_ompt_t *Interface::startTargetUpdateTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId, ompt_target_update,
+ ompt_scope_begin, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ << " (ompt_target_update)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target_update with endpoint
+/// ompt_scope_end for \p DeviceId and mark it TR_ready. Returns nullptr if
+/// target tracing is not enabled for the device or assignCursor() yields no
+/// record.
+ompt_record_ompt_t *Interface::stopTargetUpdateTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId, ompt_target_update,
+ ompt_scope_end, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ << " (ompt_target_update)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target with endpoint ompt_scope_begin
+/// for \p DeviceId and mark it TR_ready. Returns nullptr if target tracing is
+/// not enabled for the device or assignCursor() yields no record.
+ompt_record_ompt_t *Interface::startTargetTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId, ompt_target,
+ ompt_scope_begin, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+ ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ << " (ompt_target)";
+ return DataPtr;
+}
+
+/// Create a target record of kind ompt_target with endpoint ompt_scope_end
+/// for \p DeviceId and mark it TR_ready. Returns nullptr if target tracing is
+/// not enabled for the device or assignCursor() yields no record.
+ompt_record_ompt_t *Interface::stopTargetTrace(int64_t DeviceId,
+ void *CodePtr) {
+ if (!isTracingEnabled(DeviceId, ompt_callback_target))
+ return nullptr;
+
+ assert(PM && "Plugin manager not initialized");
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ ompt_record_ompt_t *DataPtr =
+ (ompt_record_ompt_t *)TRM->assignCursor(ompt_callback_target, DeviceId);
+
+ // This event will not be traced
+ if (DataPtr == nullptr)
+ return nullptr;
+
+ setTraceRecordCommon(DataPtr, ompt_callback_target);
+ setTraceRecordTarget(&DataPtr->record.target, DeviceId, ompt_target,
+ ompt_scope_end, CodePtr);
+
+ // The trace record has been created, mark it ready for delivery to the tool
+ TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
+
+ ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ << " (ompt_target)";
+ return DataPtr;
+}
+
+extern "C" {
+// Device-independent entry point for ompt_set_trace_ompt.
+// Holds TraceAccessMutex for the duration of the call and forwards all
+// arguments to setTraceEventTy(), whose result is returned unchanged.
+ompt_set_result_t libomptarget_ompt_set_trace_ompt(int DeviceId,
+ unsigned int Enable,
+ unsigned int EventTy) {
+ std::unique_lock<std::mutex> Lock(TraceAccessMutex);
+ return llvm::omp::target::ompt::setTraceEventTy(DeviceId, Enable, EventTy);
+}
+
+// Device-independent entry point for ompt_start_trace.
+// Returns 1 on success and 0 on failure. Fails if the plugin manager is not
+// available or if either buffer callback is null. On success the callbacks
+// are registered for DeviceId, device tracing is enabled and the helper
+// threads are started, all while holding TraceControlMutex.
+int libomptarget_ompt_start_trace(int DeviceId,
+ ompt_callback_buffer_request_t Request,
+ ompt_callback_buffer_complete_t Complete) {
+ if (!PM) {
+ REPORT() << "Failed to start trace for DeviceId="
+ << DeviceId << " (invalid plugin manager)";
+ // Indicate failure
+ return 0;
+ }
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ std::unique_lock<std::mutex> Lock(TraceControlMutex);
+ if (Request && Complete) {
+ // Set buffer related functions
+ llvm::omp::target::ompt::setBufferManagementFns(DeviceId, Request,
+ Complete);
+ llvm::omp::target::ompt::enableDeviceTracing(DeviceId);
+ TRM->startHelperThreads();
+ // Success
+ return 1;
+ }
+ // Failure
+ return 0;
+}
+
+// Device-independent entry point for ompt_flush_trace.
+// Returns 0 if the plugin manager is not available; otherwise returns the
+// result of flushAllBuffers(DeviceId), called while holding
+// TraceControlMutex.
+int libomptarget_ompt_flush_trace(int DeviceId) {
+ if (!PM) {
+ REPORT() << "Failed to flush trace for DeviceId="
+ << DeviceId << " (invalid plugin manager)";
+ // Indicate failure
+ return 0;
+ }
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ std::unique_lock<std::mutex> Lock(TraceControlMutex);
+ return TRM->flushAllBuffers(DeviceId);
+}
+
+// Device-independent entry point for ompt_stop_trace.
+// Returns 0 if the plugin manager is not available; otherwise returns the
+// status of flushAllBuffers(DeviceId). The buffer callbacks of DeviceId are
+// removed in either case.
+int libomptarget_ompt_stop_trace(int DeviceId) {
+ if (!PM) {
+ REPORT() << "Failed to stop trace for DeviceId="
+ << DeviceId << " (invalid plugin manager)";
+ // Indicate failure
+ return 0;
+ }
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ std::unique_lock<std::mutex> Lock(TraceControlMutex);
+
+ // Schedule flushing of trace records for this device
+ int Status = TRM->flushAllBuffers(DeviceId);
+
+ // De-register this device so that no more traces are collected
+ // or delivered for this device until an ompt_start_trace is
+ // invoked for this device.
+ removeBufferManagementFns(DeviceId);
+
+ // If no device is being traced, shut down the helper threads. A
+ // subsequent ompt_start_trace will start up the helper threads.
+ // NOTE(review): disableDeviceTracing() is only reached in this branch, so a
+ // device stopped while others are still traced keeps its TracedDevices
+ // state -- confirm this is intended.
+ if (isAllDeviceTracingStopped()) {
+ // TODO shutdown should perhaps return a status
+ TRM->shutdownHelperThreads();
+ llvm::omp::target::ompt::disableDeviceTracing(DeviceId);
+ }
+ return Status;
+}
+
+// Device-independent entry point for ompt_advance_buffer_cursor.
+// Note: The input parameter Size is unused here. It refers to the
+// bytes returned in the corresponding callback.
+// Returns true and stores CurrentPos advanced by one trace-record size in
+// *NextPos. Returns false with *NextPos set to 0 when CurrentPos is null or
+// is the last cursor. Returns false without writing *NextPos when the plugin
+// manager is not available.
+int libomptarget_ompt_advance_buffer_cursor(ompt_device_t *Device,
+ ompt_buffer_t *Buffer, size_t Size,
+ ompt_buffer_cursor_t CurrentPos,
+ ompt_buffer_cursor_t *NextPos) {
+ if (!PM) {
+ REPORT() << "Failed to advance buffer cursor for Device="
+ << Device << " (invalid plugin manager)";
+ // Indicate failure
+ return false;
+ }
+ OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
+ char *TraceRecord = (char *)CurrentPos;
+ // Don't assert if CurrentPos is null, just indicate end of buffer
+ if (TraceRecord == nullptr || TRM->isLastCursor(TraceRecord)) {
+ *NextPos = 0;
+ return false;
+ }
+ // TODO In debug mode, assert that the metadata points to the
+ // input parameter buffer
+
+ size_t TRSize = TRM->getTRSize();
+ *NextPos = (ompt_buffer_cursor_t)(TraceRecord + TRSize);
+ ODBG(ODT_Tool) << "Advanced buffer pointer by "
+ << TRSize << " bytes to "
+ << TraceRecord + TRSize;
+ return true;
+}
+
+// Store NumTeams in TraceRecordNumGrantedTeams.
+// This function is invoked before the kernel launch. So, when the trace record
+// is populated after kernel completion, TraceRecordNumGrantedTeams is already
+// updated.
+void libomptarget_ompt_set_granted_teams(uint32_t NumTeams) {
+ TraceRecordNumGrantedTeams = NumTeams;
+}
+
+// Store Start and Stop in TraceRecordStartTime and TraceRecordStopTime.
+// Assume a synchronous implementation and set thread local variables to track
+// timestamps. The thread local variables can then be used to populate trace
+// records.
+void libomptarget_ompt_set_timestamp(uint64_t Start, uint64_t Stop) {
+ TraceRecordStartTime = Start;
+ TraceRecordStopTime = Stop;
+}
+
+// Device-independent entry point to query for the trace format used.
+// Currently, only OMPT format is supported, so both arguments are ignored and
+// ompt_record_ompt is always returned.
+ompt_record_t
+libomptarget_ompt_get_record_type(ompt_buffer_t *Buffer,
+ ompt_buffer_cursor_t CurrentPos) {
+ // TODO: When different OMPT trace buffer formats are supported, this needs
+ // to be fixed.
+ return ompt_record_t::ompt_record_ompt;
+}
+} // extern "C"
+
+#pragma pop_macro("DEBUG_PREFIX")
+
+#endif // OMPT_SUPPORT
diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp
index 41b653a60adfd..a91e3ca954d5c 100644
--- a/offload/libomptarget/PluginManager.cpp
+++ b/offload/libomptarget/PluginManager.cpp
@@ -11,6 +11,9 @@
//===----------------------------------------------------------------------===//
#include "PluginManager.h"
+#include "OpenMP/OMPT/Callback.h"
+#include "OpenMP/OMPT/OmptCommonDefs.h"
+#include "OpenMP/OMPT/OmptTracing.h"
#include "OffloadPolicy.h"
#include "Shared/Debug.h"
#include "Shared/Profile.h"
@@ -47,6 +50,12 @@ void PluginManager::init() {
} while (false);
#include "Shared/Targets.def"
+#ifdef OMPT_SUPPORT
+ assert(TraceRecordManager == nullptr &&
+ "Expected trace record manager to be null");
+ TraceRecordManager = new OmptTracingBufferMgr();
+#endif
+
ODBG(ODT_Init) << "RTLs loaded!";
}
@@ -54,6 +63,13 @@ void PluginManager::deinit() {
TIMESCOPE();
ODBG(ODT_Deinit) << "Unloading RTLs...";
+#ifdef OMPT_SUPPORT
+ assert(TraceRecordManager != nullptr &&
+ "Trace record manager should have been non-null");
+ delete TraceRecordManager;
+ TraceRecordManager = nullptr;
+#endif
+
for (auto &Plugin : Plugins) {
if (!Plugin->is_initialized())
continue;
@@ -323,6 +339,9 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void PluginManager::unregisterLib(__tgt_bin_desc *Desc) {
ODBG(ODT_Deinit) << "Unloading target library!";
+ OMPT_IF_TRACING_ENABLED(
+ PM->getTraceRecordManager()->flushAndShutdownHelperThreads(););
+
Desc = upgradeLegacyEntries(Desc);
PM->RTLsMtx.lock();
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 546f679353544..f13354a27c624 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -15,6 +15,8 @@
#include "OpenMP/Mapping.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
+#include "OpenMP/OMPT/OmptCommonDefs.h"
+#include "OpenMP/OMPT/OmptTracing.h"
#include "PluginManager.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
@@ -34,7 +36,8 @@
#include <thread>
#ifdef OMPT_SUPPORT
-using namespace llvm::omp::target::ompt;
+using namespace llvm::omp::target;
+using namespace ompt;
#endif
using namespace llvm::omp::target::plugin;
@@ -69,7 +72,7 @@ int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
DeviceTy::DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
: DeviceID(DeviceID), RTL(RTL), RTLDeviceID(RTLDeviceID),
- MappingInfo(*this) {}
+ ForceSynchronousTargetRegions(false), MappingInfo(*this) {}
DeviceTy::~DeviceTy() {
if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
@@ -79,6 +82,11 @@ DeviceTy::~DeviceTy() {
dumpTargetPointerMappings(&Loc, *this);
}
+inline void setAsyncInfoSynchronous(__tgt_async_info *AI, bool SetSynchronous) {
+ if (SetSynchronous)
+ AI->ExecAsync = false;
+}
+
llvm::Error DeviceTy::init() {
int32_t Ret = RTL->init_device(RTLDeviceID);
if (Ret != OFFLOAD_SUCCESS)
@@ -242,10 +250,15 @@ DeviceTy::loadBinary(__tgt_device_image *Img) {
void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
/// RAII to establish tool anchors before and after data allocation
void *TargetPtr = nullptr;
- OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
- RegionInterface.getCallbacks<ompt_target_data_alloc>(),
- DeviceID, HstPtr, &TargetPtr, Size,
- /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
+ OMPT_IF_BUILT(
+ InterfaceRAII TargetDataAllocRAII(
+ RegionInterface.getCallbacks<ompt_target_data_alloc>(), DeviceID,
+ HstPtr, &TargetPtr, Size,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ InterfaceRAII TargetDataAllocTraceRAII(
+ RegionInterface.getTraceGenerators<ompt_target_data_alloc>(),
+ RTLDeviceID, HstPtr, &TargetPtr, Size,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
TargetPtr = RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
return TargetPtr;
@@ -253,11 +266,15 @@ void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
int32_t DeviceTy::deleteData(void *TgtAllocBegin, int32_t Kind) {
/// RAII to establish tool anchors before and after data deletion
- OMPT_IF_BUILT(InterfaceRAII TargetDataDeleteRAII(
- RegionInterface.getCallbacks<ompt_target_data_delete>(),
- DeviceID, TgtAllocBegin,
- /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
-
+ OMPT_IF_BUILT(
+ InterfaceRAII TargetDataDeleteRAII(
+ RegionInterface.getCallbacks<ompt_target_data_delete>(), DeviceID,
+ TgtAllocBegin,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ InterfaceRAII TargetDataDeleteTraceRAII(
+ RegionInterface.getTraceGenerators<ompt_target_data_delete>(),
+ DeviceID, TgtAllocBegin,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
return RTL->data_delete(RTLDeviceID, TgtAllocBegin, Kind);
}
@@ -274,8 +291,16 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
InterfaceRAII TargetDataSubmitRAII(
RegionInterface.getCallbacks<ompt_target_data_transfer_to_device>(),
omp_initial_device, HstPtrBegin, DeviceID, TgtPtrBegin, Size,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ TracerInterfaceRAII TargetDataSubmitTraceRAII(
+ RegionInterface
+ .getTraceGenerators<ompt_target_data_transfer_to_device>(),
+ AsyncInfo, RTL->getProfiler(), /*TracedDeviceId=*/DeviceID,
+ /*EventType=*/ompt_callback_target_data_op, omp_initial_device,
+ HstPtrBegin, DeviceID, TgtPtrBegin, Size,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
+ setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
AsyncInfo);
}
@@ -294,8 +319,16 @@ int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
InterfaceRAII TargetDataRetrieveRAII(
RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
DeviceID, TgtPtrBegin, omp_initial_device, HstPtrBegin, Size,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ TracerInterfaceRAII TargetDataRetrieveTraceRAII(
+ RegionInterface
+ .getTraceGenerators<ompt_target_data_transfer_from_device>(),
+ AsyncInfo, RTL->getProfiler(), /*TracedDeviceId=*/DeviceID,
+ /*EventType=*/ompt_callback_target_data_op, DeviceID, TgtPtrBegin,
+ omp_initial_device, HstPtrBegin, Size,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
+ setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
AsyncInfo);
}
@@ -313,11 +346,16 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
InterfaceRAII TargetDataExchangeRAII(
RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ TracerInterfaceRAII TargetDataExchangeTraceRAII(
+ RegionInterface
+ .getTraceGenerators<ompt_target_data_transfer_from_device>(),
+ AsyncInfo, RTL->getProfiler(), /*TracedDeviceId=*/RTLDeviceID,
+ /*EventType=*/ompt_callback_target_data_op, RTLDeviceID, SrcPtr,
+ DstDev.RTLDeviceID, DstPtr, Size,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
- if (!AsyncInfo) {
- return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
- Size);
- }
+
+ setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
DstPtr, Size, AsyncInfo);
}
@@ -352,6 +390,7 @@ int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
KernelExtraArgsTy *KernelExtraArgs,
AsyncInfoTy &AsyncInfo) {
+ setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
&KernelArgs, KernelExtraArgs, AsyncInfo);
}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 1831c43cc5f29..6e404ed059096 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -67,6 +67,14 @@ VERS1.0 {
llvm_omp_target_unlock_mem;
__tgt_set_info_flag;
__tgt_print_device_info;
+ libomptarget_ompt_set_trace_ompt;
+ libomptarget_ompt_start_trace;
+ libomptarget_ompt_flush_trace;
+ libomptarget_ompt_stop_trace;
+ libomptarget_ompt_set_granted_teams;
+ libomptarget_ompt_set_timestamp;
+ libomptarget_ompt_advance_buffer_cursor;
+ libomptarget_ompt_get_record_type;
omp_get_interop_ptr;
omp_get_interop_str;
omp_get_interop_int;
diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index a436708814c90..1bbc3f63ce292 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -14,6 +14,7 @@
#include "OpenMP/OMPT/Interface.h"
#include "OffloadPolicy.h"
#include "OpenMP/OMPT/Callback.h"
+#include "OpenMP/OMPT/OmptCommonDefs.h"
#include "OpenMP/omp.h"
#include "PluginManager.h"
#include "omptarget.h"
@@ -153,19 +154,30 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
/// RAII to establish tool anchors before and after data begin / end / update
- OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
- TargetDataFunction == targetDataEnd ||
- TargetDataFunction == targetDataUpdate) &&
- "Encountered unexpected TargetDataFunction during "
- "execution of targetData");
- auto CallbackFunctions =
- (TargetDataFunction == targetDataBegin)
- ? RegionInterface.getCallbacks<ompt_target_enter_data>()
- : (TargetDataFunction == targetDataEnd)
- ? RegionInterface.getCallbacks<ompt_target_exit_data>()
- : RegionInterface.getCallbacks<ompt_target_update>();
- InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
- OMPT_GET_RETURN_ADDRESS);)
+ OMPT_IF_BUILT(
+ assert((TargetDataFunction == targetDataBegin ||
+ TargetDataFunction == targetDataEnd ||
+ TargetDataFunction == targetDataUpdate) &&
+ "Encountered unexpected TargetDataFunction during "
+ "execution of targetData");
+ auto CallbackFunctions =
+ (TargetDataFunction == targetDataBegin)
+ ? RegionInterface.getCallbacks<ompt_target_enter_data>()
+ : (TargetDataFunction == targetDataEnd)
+ ? RegionInterface.getCallbacks<ompt_target_exit_data>()
+ : RegionInterface.getCallbacks<ompt_target_update>();
+
+ auto TraceGenerators =
+ (TargetDataFunction == targetDataBegin)
+ ? RegionInterface.getTraceGenerators<ompt_target_enter_data>()
+ : (TargetDataFunction == targetDataEnd)
+ ? RegionInterface.getTraceGenerators<ompt_target_exit_data>()
+ : RegionInterface.getTraceGenerators<ompt_target_update>();
+
+ InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ InterfaceRAII TargetDataTraceRAII(TraceGenerators, DeviceId,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
int Rc = OFFLOAD_SUCCESS;
@@ -433,6 +445,9 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
/// RAII to establish tool anchors before and after target region
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ InterfaceRAII TargetTraceRAII(
+ RegionInterface.getTraceGenerators<ompt_target>(), DeviceId,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
int Rc = OFFLOAD_SUCCESS;
@@ -542,6 +557,9 @@ EXTERN int __tgt_target_kernel_replay(
/// RAII to establish tool anchors before and after target region
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
+ /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);
+ InterfaceRAII TargetTraceRAII(
+ RegionInterface.getTraceGenerators<ompt_target>(), DeviceId,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
AsyncInfoTy AsyncInfo(*DeviceOrErr);
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 17b215732d51b..2d55e2bbf00f2 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -2347,8 +2347,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
// No need to guard this with OMPT_IF_BUILT
InterfaceRAII TargetSubmitRAII(
RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
-#endif
+ TracerInterfaceRAII TargetTraceRAII(
+ RegionInterface.getTraceGenerators<ompt_callback_target_submit>(),
+ AsyncInfo, Device.RTL->getProfiler(), /*TracedDeviceId=*/DeviceId,
+ /*EventType=*/ompt_callback_target_submit, DeviceId, NumTeams);
+#endif
Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
KernelArgs, nullptr, AsyncInfo);
}
>From c721c5d19717faf91f08a515f52ed2f6c6ff0375 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 13 Apr 2026 06:26:35 -0500
Subject: [PATCH 08/15] [Offload][OMPT] Fix global destructor warnings in OMPT
tracing globals
Replace five globals that have non-trivial destructors with function-local
statics, following the canonical LLVM pattern (see Signals.cpp). This
resolves the -Werror=global-constructors failures that were triggered by
the destructors of std::map, std::unordered_map, std::shared_ptr, and
a thread_local array of shared_ptr.
Affected globals:
- ParentLibrary (shared_ptr) in plugin OmptTracing.cpp -> file-local
getParentLibraryStorage() helper
- Devices (map) in plugin OmptTracing.cpp -> getDevices()
- BufferManagementFns (unordered_map) in libomptarget OmptTracing.cpp
-> file-local getBufferMgmt() helper
- TracedDevices (map) in libomptarget OmptTracing.cpp -> getTracedDevices()
- ArrayOfBufPtr (thread_local shared_ptr array) in OmptTracingBuffer.cpp
-> getArrayOfBufPtr()
The extern declarations are replaced with getter-function declarations in
the corresponding headers (OmptDeviceTracing.h, OmptTracing.h, and
OmptTracingBuffer.h). All use sites are updated accordingly.
Made-with: Cursor
---
offload/include/OpenMP/OMPT/OmptTracing.h | 11 +--
.../include/OpenMP/OMPT/OmptTracingBuffer.h | 5 +-
.../libomptarget/OpenMP/OMPT/OmptTracing.cpp | 72 ++++++++++---------
.../OpenMP/OMPT/OmptTracingBuffer.cpp | 12 ++--
.../common/OMPT/OmptDeviceTracing.h | 20 +++---
.../common/OMPT/OmptTracing.cpp | 41 ++++++-----
6 files changed, 85 insertions(+), 76 deletions(-)
diff --git a/offload/include/OpenMP/OMPT/OmptTracing.h b/offload/include/OpenMP/OMPT/OmptTracing.h
index 2a892582923d5..4c8c049a9ae48 100644
--- a/offload/include/OpenMP/OMPT/OmptTracing.h
+++ b/offload/include/OpenMP/OMPT/OmptTracing.h
@@ -102,8 +102,9 @@ ompt_set_result_t setTraceEventTy(int DeviceId, unsigned int Enable,
/// Return thread id
uint64_t getThreadId();
-/// See TracedDevices in OmptDeviceTracing.h
-extern std::map<int32_t, uint64_t> TracedDevices;
+/// Access the map of RTL device IDs to their currently enabled tracing event
+/// types. See also OmptDeviceTracing.h.
+std::map<int32_t, uint64_t> &getTracedDevices();
/// Activate tracing on the given device
void enableDeviceTracing(int DeviceId);
/// Deactivate tracing on the given device
@@ -123,12 +124,6 @@ extern std::mutex TraceHashThreadMutex;
/// buffer-completion callback functions.
extern std::mutex BufferManagementFnMutex;
-/// Map from device-id to the corresponding buffer-request and buffer-completion
-/// callback functions.
-extern std::unordered_map<int, std::pair<ompt_callback_buffer_request_t,
- ompt_callback_buffer_complete_t>>
- BufferManagementFns;
-
/// Thread local variables used by the plugin to communicate OMPT information
/// that are then used to populate trace records. This method assumes a
/// synchronous implementation, otherwise it won't work.
diff --git a/offload/include/OpenMP/OMPT/OmptTracingBuffer.h b/offload/include/OpenMP/OMPT/OmptTracingBuffer.h
index 5c9f4bf33dae8..a9b7f8d569fcb 100644
--- a/offload/include/OpenMP/OMPT/OmptTracingBuffer.h
+++ b/offload/include/OpenMP/OMPT/OmptTracingBuffer.h
@@ -133,8 +133,9 @@ class OmptTracingBufferMgr {
// Thread-specific array of pointers to a buffer. The buffer pointed to
// is the last one allocated by this thread for a given device. The ith
// element points to the buffer for the ith device. At most MAX_NUM_DEVICES
- // devices are supported.
- static thread_local BufPtr ArrayOfBufPtr[MAX_NUM_DEVICES];
+ // devices are supported. Stored as a function-local thread_local to avoid
+ // a global destructor.
+ static BufPtr *getArrayOfBufPtr();
/*
* A buffer is flushed when it fills up or when the tool invokes
diff --git a/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp b/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
index 78b81f377dab4..2e7b431b71389 100644
--- a/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
+++ b/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
@@ -42,17 +42,25 @@ std::mutex llvm::omp::target::ompt::TraceControlMutex;
std::mutex llvm::omp::target::ompt::TraceHashThreadMutex;
std::mutex llvm::omp::target::ompt::BufferManagementFnMutex;
-std::unordered_map<int /*DeviceId*/, std::pair<ompt_callback_buffer_request_t,
- ompt_callback_buffer_complete_t>>
- llvm::omp::target::ompt::BufferManagementFns;
-
thread_local uint32_t llvm::omp::target::ompt::TraceRecordNumGrantedTeams = 0;
thread_local uint64_t llvm::omp::target::ompt::TraceRecordStartTime = 0;
thread_local uint64_t llvm::omp::target::ompt::TraceRecordStopTime = 0;
thread_local uint64_t llvm::omp::target::ompt::ThreadId =
std::numeric_limits<uint64_t>::max();
-std::map<int32_t, uint64_t> llvm::omp::target::ompt::TracedDevices;
+// File-local helper returning a reference to the buffer-management map stored
+// as a function-local static, avoiding a global destructor.
+static auto &getBufferMgmt() {
+ static std::unordered_map<int, std::pair<ompt_callback_buffer_request_t,
+ ompt_callback_buffer_complete_t>>
+ M;
+ return M;
+}
+
+std::map<int32_t, uint64_t> &llvm::omp::target::ompt::getTracedDevices() {
+ static std::map<int32_t, uint64_t> TracedDevices;
+ return TracedDevices;
+}
bool llvm::omp::target::ompt::TracingActive = false;
@@ -61,8 +69,8 @@ void llvm::omp::target::ompt::resetTimestamp(uint64_t *T) { *T = 0; }
ompt_callback_buffer_request_t
llvm::omp::target::ompt::getBufferRequestFn(int DeviceId) {
std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
- auto BufferMgrItr = BufferManagementFns.find(DeviceId);
- if (BufferMgrItr == BufferManagementFns.end()) {
+ auto BufferMgrItr = getBufferMgmt().find(DeviceId);
+ if (BufferMgrItr == getBufferMgmt().end()) {
return nullptr;
}
return BufferMgrItr->second.first;
@@ -71,8 +79,8 @@ llvm::omp::target::ompt::getBufferRequestFn(int DeviceId) {
ompt_callback_buffer_complete_t
llvm::omp::target::ompt::getBufferCompleteFn(int DeviceId) {
std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
- auto BufferMgrItr = BufferManagementFns.find(DeviceId);
- if (BufferMgrItr == BufferManagementFns.end()) {
+ auto BufferMgrItr = getBufferMgmt().find(DeviceId);
+ if (BufferMgrItr == getBufferMgmt().end()) {
return nullptr;
}
return BufferMgrItr->second.second;
@@ -82,29 +90,29 @@ void llvm::omp::target::ompt::setBufferManagementFns(
int DeviceId, ompt_callback_buffer_request_t ReqFn,
ompt_callback_buffer_complete_t CmpltFn) {
std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
- auto BufferMgrItr = BufferManagementFns.find(DeviceId);
- if (BufferMgrItr != BufferManagementFns.end()) {
+ auto BufferMgrItr = getBufferMgmt().find(DeviceId);
+ if (BufferMgrItr != getBufferMgmt().end()) {
REPORT() << "Buffer request and complete functions already exist for device "
<< DeviceId << "ignoring ...";
return;
}
- BufferManagementFns[DeviceId] = std::make_pair(ReqFn, CmpltFn);
+ getBufferMgmt()[DeviceId] = std::make_pair(ReqFn, CmpltFn);
}
void llvm::omp::target::ompt::removeBufferManagementFns(int DeviceId) {
std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
- auto BufferMgrItr = BufferManagementFns.find(DeviceId);
- if (BufferMgrItr == BufferManagementFns.end()) {
+ auto BufferMgrItr = getBufferMgmt().find(DeviceId);
+ if (BufferMgrItr == getBufferMgmt().end()) {
REPORT() << "Buffer request and complete functions don't exist for device "
<< DeviceId << "ignoring ...";
return;
}
- BufferManagementFns.erase(BufferMgrItr);
+ getBufferMgmt().erase(BufferMgrItr);
}
bool llvm::omp::target::ompt::isAllDeviceTracingStopped() {
std::unique_lock<std::mutex> Lock(BufferManagementFnMutex);
- return BufferManagementFns.empty();
+ return getBufferMgmt().empty();
}
void llvm::omp::target::ompt::ompt_callback_buffer_request(
@@ -137,11 +145,11 @@ inline bool checkDeviceTracingState(const uint64_t &TracingTypes) {
void llvm::omp::target::ompt::enableDeviceTracing(int DeviceId) {
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
- auto Device = TracedDevices.find(DeviceId);
- if (Device == TracedDevices.end()) {
+ auto Device = getTracedDevices().find(DeviceId);
+ if (Device == getTracedDevices().end()) {
uint64_t TracingTypes{0};
setDeviceTracing(TracingTypes);
- TracedDevices.emplace(DeviceId, TracingTypes);
+ getTracedDevices().emplace(DeviceId, TracingTypes);
} else
setDeviceTracing(Device->second);
// In any case: at least one device is traced
@@ -150,16 +158,16 @@ void llvm::omp::target::ompt::enableDeviceTracing(int DeviceId) {
void llvm::omp::target::ompt::disableDeviceTracing(int DeviceId) {
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
- auto Device = TracedDevices.find(DeviceId);
- if (Device == TracedDevices.end()) {
+ auto Device = getTracedDevices().find(DeviceId);
+ if (Device == getTracedDevices().end()) {
uint64_t TracingTypes{0};
resetDeviceTracing(TracingTypes);
- TracedDevices.emplace(DeviceId, TracingTypes);
+ getTracedDevices().emplace(DeviceId, TracingTypes);
} else
resetDeviceTracing(Device->second);
// Check for actively traced devices
- for (auto &Dev : TracedDevices)
+ for (auto &Dev : getTracedDevices())
if (checkDeviceTracingState(Dev.second))
return;
@@ -175,8 +183,8 @@ bool llvm::omp::target::ompt::isTracingEnabled(int DeviceId,
bool llvm::omp::target::ompt::isTracedDevice(int DeviceId) {
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
- auto Device = TracedDevices.find(DeviceId);
- if (Device != TracedDevices.end())
+ auto Device = getTracedDevices().find(DeviceId);
+ if (Device != getTracedDevices().end())
return checkDeviceTracingState(Device->second);
return false;
@@ -187,8 +195,8 @@ bool llvm::omp::target::ompt::isTracingTypeEnabled(int DeviceId,
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
// Make sure we do not shift more than std::numeric_limits<uint64_t>::digits
assert(EventTy < 64 && "Shift limit exceeded: EventTy must be less than 64");
- auto Device = TracedDevices.find(DeviceId);
- if (Device != TracedDevices.end() && EventTy < 64)
+ auto Device = getTracedDevices().find(DeviceId);
+ if (Device != getTracedDevices().end() && EventTy < 64)
return (Device->second & (1UL << EventTy));
return false;
}
@@ -198,8 +206,8 @@ bool llvm::omp::target::ompt::isTracingTypeGroupEnabled(int DeviceId,
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
// Make sure we do not shift more than std::numeric_limits<uint64_t>::digits
assert(EventTy < 64 && "Shift limit exceeded: EventTy must be less than 64");
- auto Device = TracedDevices.find(DeviceId);
- if (Device != TracedDevices.end() && EventTy < 64) {
+ auto Device = getTracedDevices().find(DeviceId);
+ if (Device != getTracedDevices().end() && EventTy < 64) {
auto TracedEvents = Device->second;
switch (EventTy) {
case ompt_callbacks_t::ompt_callback_target:
@@ -252,10 +260,10 @@ llvm::omp::target::ompt::setTraceEventTy(int DeviceId, unsigned int Enable,
<< " Enable=" << Enable << " EventTy=" << EventTy;
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
- if (TracedDevices.find(DeviceId) == TracedDevices.end())
- TracedDevices.emplace(DeviceId, 0UL);
+ if (getTracedDevices().find(DeviceId) == getTracedDevices().end())
+ getTracedDevices().emplace(DeviceId, 0UL);
- auto &TracedEventTy = TracedDevices[DeviceId];
+ auto &TracedEventTy = getTracedDevices()[DeviceId];
bool Enabled = Enable > 0;
if (EventTy == 0) {
// Set / reset all supported types
diff --git a/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp b/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp
index 2d4b32e3554ac..cf93b5f968ac7 100644
--- a/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp
+++ b/offload/libomptarget/OpenMP/OMPT/OmptTracingBuffer.cpp
@@ -32,8 +32,10 @@ static std::atomic<uint64_t> BufId{0};
// Unique id in buffer flush order
static std::atomic<uint64_t> FlushId{0};
-thread_local OmptTracingBufferMgr::BufPtr
- OmptTracingBufferMgr::ArrayOfBufPtr[MAX_NUM_DEVICES];
+OmptTracingBufferMgr::BufPtr *OmptTracingBufferMgr::getArrayOfBufPtr() {
+ thread_local BufPtr Array[MAX_NUM_DEVICES]{};
+ return Array;
+}
static uint64_t get_and_inc_buf_id() { return BufId++; }
@@ -418,7 +420,7 @@ OmptTracingBufferMgr::getDeviceSpecificBuffer(int64_t DeviceId) {
<< MAX_NUM_DEVICES - 1;
return nullptr;
}
- return ArrayOfBufPtr[DeviceId];
+ return getArrayOfBufPtr()[DeviceId];
}
void OmptTracingBufferMgr::setDeviceSpecificBuffer(int64_t DeviceId,
@@ -429,7 +431,7 @@ void OmptTracingBufferMgr::setDeviceSpecificBuffer(int64_t DeviceId,
<< MAX_NUM_DEVICES - 1;
return;
}
- ArrayOfBufPtr[DeviceId] = Buf;
+ getArrayOfBufPtr()[DeviceId] = Buf;
}
void OmptTracingBufferMgr::setTRStatus(void *Rec, TRStatus Status) {
@@ -681,7 +683,7 @@ void OmptTracingBufferMgr::waitForFlushCompletion() {
void OmptTracingBufferMgr::init() {
for (int i = 0; i < MAX_NUM_DEVICES; ++i)
- ArrayOfBufPtr[i] = nullptr;
+ getArrayOfBufPtr()[i] = nullptr;
ThreadFlushTracker = 0;
ThreadShutdownTracker = 0;
DoneTracing = false; // TODO make it a class member
diff --git a/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h b/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h
index 5744a784825da..ba9e1dcd6ccc3 100644
--- a/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h
+++ b/offload/plugins-nextgen/common/OMPT/OmptDeviceTracing.h
@@ -85,19 +85,17 @@ extern double HostToDeviceSlope;
/// Host to device constant clock offset
extern double HostToDeviceOffset;
-/// Mapping of device pointers to their corresponding RTL device ID
-extern std::map<ompt_device_t *, int32_t> Devices;
+/// Access the map of device pointers to their corresponding RTL device ID
+std::map<ompt_device_t *, int32_t> &getDevices();
-/// Mapping of RTL device IDs to their currently enabled tracing event types.
-/// Note: Event type '0' (bit position) indicates if this device is traced.
-extern std::map<int32_t, uint64_t> TracedDevices;
+/// Access the map of RTL device IDs to their currently enabled tracing event
+/// types. Note: Event type '0' (bit position) indicates if this device is
+/// traced.
+std::map<int32_t, uint64_t> &getTracedDevices();
/// OMPT global tracing status. Indicates if at least one device is traced.
extern bool TracingActive;
-/// Parent library pointer
-extern std::shared_ptr<llvm::sys::DynamicLibrary> ParentLibrary;
-
/// Get the parent library by pointer. If it is not already set, it will set the
/// parent library pointer.
std::shared_ptr<llvm::sys::DynamicLibrary> getParentLibrary();
@@ -111,10 +109,10 @@ void setParentLibrary(const char *Filename);
template <typename FT>
void ensureFuncPtrLoaded(const std::string &FuncName, FT *FuncPtr) {
if (*FuncPtr == nullptr) {
- if ((ParentLibrary == nullptr && getParentLibrary() == nullptr) ||
- !ParentLibrary->isValid())
+ auto Lib = getParentLibrary();
+ if (!Lib || !Lib->isValid())
return;
- void *SymbolPtr = ParentLibrary->getAddressOfSymbol(FuncName.c_str());
+ void *SymbolPtr = Lib->getAddressOfSymbol(FuncName.c_str());
if (SymbolPtr == nullptr)
return;
*FuncPtr = reinterpret_cast<FT>(SymbolPtr);
diff --git a/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp b/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp
index d5be9b8d9cd8a..6e900143c4012 100644
--- a/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp
+++ b/offload/plugins-nextgen/common/OMPT/OmptTracing.cpp
@@ -46,39 +46,44 @@ std::mutex llvm::omp::target::ompt::DeviceIdWritingMutex;
using namespace llvm::omp::target::ompt;
using namespace llvm::omp::target::debug;
-std::shared_ptr<llvm::sys::DynamicLibrary>
- llvm::omp::target::ompt::ParentLibrary(nullptr);
-
double llvm::omp::target::ompt::HostToDeviceSlope = .0;
double llvm::omp::target::ompt::HostToDeviceOffset = .0;
-std::map<ompt_device_t *, int32_t> llvm::omp::target::ompt::Devices;
+// File-local helper to hold the parent library shared_ptr as a function-local
+// static, avoiding a global destructor.
+static std::shared_ptr<llvm::sys::DynamicLibrary> &getParentLibraryStorage() {
+ static std::shared_ptr<llvm::sys::DynamicLibrary> Lib(nullptr);
+ return Lib;
+}
std::shared_ptr<llvm::sys::DynamicLibrary>
llvm::omp::target::ompt::getParentLibrary() {
- static bool ParentLibraryAssigned = false;
- if (!ParentLibraryAssigned) {
+ if (!getParentLibraryStorage()) {
setParentLibrary("libomptarget.so");
- ParentLibraryAssigned = true;
}
- return ParentLibrary;
+ return getParentLibraryStorage();
}
void llvm::omp::target::ompt::setParentLibrary(const char *Filename) {
- if (ParentLibrary)
+ if (getParentLibraryStorage())
return;
std::string ErrorMsg;
- ParentLibrary = std::make_shared<llvm::sys::DynamicLibrary>(
+ getParentLibraryStorage() = std::make_shared<llvm::sys::DynamicLibrary>(
llvm::sys::DynamicLibrary::getPermanentLibrary(Filename, &ErrorMsg));
- if ((ParentLibrary == nullptr) || (!ParentLibrary->isValid()))
+ if (!getParentLibraryStorage() || !getParentLibraryStorage()->isValid())
REPORT() << "Failed to set parent library: " << ErrorMsg.c_str();
}
+std::map<ompt_device_t *, int32_t> &llvm::omp::target::ompt::getDevices() {
+ static std::map<ompt_device_t *, int32_t> Devices;
+ return Devices;
+}
+
int llvm::omp::target::ompt::getDeviceId(ompt_device_t *Device) {
// Block other threads, which might trigger an erase (for the same device)
std::unique_lock<std::mutex> Lock(DeviceIdWritingMutex);
- auto DeviceIterator = Devices.find(Device);
- if (Device == nullptr || DeviceIterator == Devices.end()) {
+ auto DeviceIterator = getDevices().find(Device);
+ if (Device == nullptr || DeviceIterator == getDevices().end()) {
REPORT() << "Failed to get ID for Device=" << Device;
return -1;
}
@@ -93,8 +98,8 @@ void llvm::omp::target::ompt::setDeviceId(ompt_device_t *Device,
return;
}
std::unique_lock<std::mutex> Lock(DeviceIdWritingMutex);
- auto DeviceIterator = Devices.find(Device);
- if (DeviceIterator != Devices.end()) {
+ auto DeviceIterator = getDevices().find(Device);
+ if (DeviceIterator != getDevices().end()) {
auto CurrentDeviceId = DeviceIterator->second;
if (DeviceId == CurrentDeviceId) {
REPORT() << "Tried to duplicate OMPT Device= " << Device << " ID=" << DeviceId;
@@ -103,7 +108,7 @@ void llvm::omp::target::ompt::setDeviceId(ompt_device_t *Device,
}
return;
}
- Devices.emplace(Device, DeviceId);
+ getDevices().emplace(Device, DeviceId);
}
void llvm::omp::target::ompt::removeDeviceId(ompt_device_t *Device) {
@@ -113,8 +118,8 @@ void llvm::omp::target::ompt::removeDeviceId(ompt_device_t *Device) {
return;
}
std::unique_lock<std::mutex> Lock(DeviceIdWritingMutex);
- Devices.erase(Device);
- TracedDevices.erase(DeviceId);
+ getDevices().erase(Device);
+ getTracedDevices().erase(DeviceId);
}
OMPT_API_ROUTINE ompt_set_result_t ompt_set_trace_ompt(ompt_device_t *Device,
>From 37e1a9a4333caeddad42285e6cfd8e239ddceac0 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Wed, 22 Apr 2026 08:39:12 -0500
Subject: [PATCH 09/15] WIP: Fix OmptProfiler linkage, add AMDGPU tracing stubs
---
offload/libomptarget/CMakeLists.txt | 13 +++++++++++++
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14 +++++++++++++-
offload/plugins-nextgen/common/CMakeLists.txt | 6 +++++-
3 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/offload/libomptarget/CMakeLists.txt b/offload/libomptarget/CMakeLists.txt
index db75d72802441..7f2ff7329a536 100644
--- a/offload/libomptarget/CMakeLists.txt
+++ b/offload/libomptarget/CMakeLists.txt
@@ -50,6 +50,19 @@ endforeach()
target_compile_options(omptarget PRIVATE ${offload_compile_flags})
target_link_options(omptarget PRIVATE ${offload_link_flags})
if (OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
+ # OmptProfiler.cpp provides the strong getProfilerToAttach() that routes
+ # plugin device events through OmptProfilerTy. It must be compiled directly
+ # into omptarget so its strong definition unconditionally overrides the weak
+ # fallback in GenericProfiler.cpp (which is embedded into each plugin archive
+ # via the PluginCommon OBJECT library and would otherwise win the link).
+ target_sources(omptarget PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/../plugins-nextgen/common/OMPT/OmptProfiler.cpp
+ )
+ target_include_directories(omptarget PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/../plugins-nextgen/common/OMPT
+ ${CMAKE_CURRENT_SOURCE_DIR}/../plugins-nextgen/common/include
+ )
+ # PluginOmpt provides OmptTracing.cpp (plugin-side GPU tracing).
target_link_libraries(omptarget PRIVATE PluginOmpt)
endif()
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 460afa94f14c1..331b16cef852f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -153,7 +153,8 @@ static double TicksToTime = 1.0;
static void setHSATicksToTimeConstant() { TicksToTime = setTicksToTime(); }
/// Get the current HSA-based system timestamp in nanoseconds.
-static uint64_t getSystemTimestampInNs() {
+/// Called by OmptTracing.cpp (from PluginOmpt) for device time queries.
+uint64_t getSystemTimestampInNs() {
uint64_t TimeStamp = 0;
hsa_status_t Status =
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &TimeStamp);
@@ -162,6 +163,17 @@ static uint64_t getSystemTimestampInNs() {
return TimeStamp;
}
+/// Enable or disable HSA async copy profiling for OMPT device tracing.
+/// Called by OmptTracing.cpp when a tool activates/deactivates device tracing.
+/// Full HSA profiling integration (enabling per-copy timing signals) will be
+/// wired up in a follow-on commit.
+void setOmptAsyncCopyProfile(bool Enable) {}
+
+/// Enable or disable HSA queue kernel profiling for OMPT device tracing.
+/// Called by OmptTracing.cpp when a tool activates/deactivates device tracing.
+/// Full HSA queue profiling integration will be wired up in a follow-on commit.
+void setGlobalOmptKernelProfile(void *Device, int Enable) {}
+
namespace llvm {
namespace omp {
namespace target {
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index a5154b97c5d0b..a717deb23e5e5 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -38,7 +38,11 @@ include(FindLibcCommonUtils)
target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
if (OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
- add_library(PluginOmpt STATIC OMPT/OmptTracing.cpp OMPT/OmptProfiler.cpp)
+ # OmptTracing.cpp calls plugin-specific symbols (HSA profiling etc.) and
+ # is linked per-plugin. OmptProfiler.cpp is added directly to omptarget
+ # in offload/libomptarget/CMakeLists.txt to ensure its strong
+ # getProfilerToAttach() override is unconditionally included.
+ add_library(PluginOmpt STATIC OMPT/OmptTracing.cpp)
target_include_directories(PluginOmpt PUBLIC
OMPT
${CMAKE_CURRENT_SOURCE_DIR}/include
>From 482cbf1221cada8452dba8c95d1da3e3f414acbf Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 21 May 2026 08:43:39 -0500
Subject: [PATCH 10/15] [OpenMP] Add OMPT device tracing tests
---
.../callbacks.h | 340 ++++++++++++++++++
.../veccopy_ompt_selective_tracing_dataop.cpp | 66 ++++
.../callbacks.h | 340 ++++++++++++++++++
.../veccopy_ompt_selective_tracing_kernel.cpp | 70 ++++
.../callbacks.h | 332 +++++++++++++++++
.../veccopy-ompt-target-data-tracing-emi.cpp | 146 ++++++++
.../callbacks.h | 333 +++++++++++++++++
.../veccopy-ompt-target-emi-tracing-dag.cpp | 81 +++++
.../callbacks.h | 332 +++++++++++++++++
.../veccopy-ompt-target-emi-tracing.cpp | 78 ++++
.../callbacks.h | 332 +++++++++++++++++
.../veccopy-ompt-target-emi-tracing.cpp | 67 ++++
.../callbacks.h | 332 +++++++++++++++++
.../veccopy-ompt-target-emi-tracing.cpp | 66 ++++
.../veccopy-ompt-target-tracing/callbacks.h | 292 +++++++++++++++
.../veccopy-ompt-target-tracing.cpp | 78 ++++
16 files changed, 3285 insertions(+)
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/veccopy_ompt_selective_tracing_dataop.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/veccopy_ompt_selective_tracing_kernel.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/veccopy-ompt-target-data-tracing-emi.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/veccopy-ompt-target-emi-tracing-dag.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/veccopy-ompt-target-emi-tracing.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/veccopy-ompt-target-emi-tracing.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/veccopy-ompt-target-emi-tracing.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-tracing/veccopy-ompt-target-tracing.cpp
diff --git a/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/callbacks.h
new file mode 100644
index 0000000000000..5cd611990d32b
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/callbacks.h
@@ -0,0 +1,340 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+ free(buffer);
+ printf("Deallocated %p\n", buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes,
+ *buffer);
+}
+
+static void on_ompt_callback_buffer_complete(
+ int device_num, ompt_buffer_t *buffer,
+ size_t bytes, /* bytes returned in this callback */
+ ompt_buffer_cursor_t begin, int buffer_owned) {
+ printf("Executing buffer complete callback: %d %p %lu %p %d\n", device_num,
+ buffer, bytes, (void *)begin, buffer_owned);
+
+ int status = 1;
+ ompt_buffer_cursor_t current = begin;
+ while (status) {
+ ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+ print_record_ompt(rec);
+ status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+ }
+ if (buffer_owned)
+ delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+ // Only enable device tracing for dataop events explicitly
+#if EMI
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op_emi);
+#else
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op);
+#endif
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace)
+ return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace)
+ return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+ assert(DeviceMapPtr != nullptr && "DeviceMapPtr must be valid");
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_data_op_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+ ompt_data_t *target_data, ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+ void *dest_addr, int dest_device_num, size_t bytes,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ *host_op_id = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+ "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+ "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ endpoint, optype, target_task_data, target_task_data_value,
+ target_data, target_data->value, host_op_id, *host_op_id, src_addr,
+ src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ target_data->value = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+ "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+ kind, endpoint, device_num, task_data, task_data->value,
+ target_task_data, target_task_data_value, target_data,
+ target_data->value, codeptr_ra);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/veccopy_ompt_selective_tracing_dataop.cpp b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/veccopy_ompt_selective_tracing_dataop.cpp
new file mode 100644
index 0000000000000..c07bff4230688
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-dataop/veccopy_ompt_selective_tracing_dataop.cpp
@@ -0,0 +1,66 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+#pragma omp target teams distribute parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+/// CHECK: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=10
+/// CHECK: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+/// CHECK: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=10
+/// CHECK: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
diff --git a/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/callbacks.h
new file mode 100644
index 0000000000000..9655d718c7c48
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/callbacks.h
@@ -0,0 +1,340 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+ free(buffer);
+ printf("Deallocated %p\n", buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes,
+ *buffer);
+}
+
+static void on_ompt_callback_buffer_complete(
+ int device_num, ompt_buffer_t *buffer,
+ size_t bytes, /* bytes returned in this callback */
+ ompt_buffer_cursor_t begin, int buffer_owned) {
+ printf("Executing buffer complete callback: %d %p %lu %p %d\n", device_num,
+ buffer, bytes, (void *)begin, buffer_owned);
+
+ int status = 1;
+ ompt_buffer_cursor_t current = begin;
+ while (status) {
+ ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+ print_record_ompt(rec);
+ status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+ }
+ if (buffer_owned)
+ delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+ // Only enable device tracing for kernel submit events
+#if EMI
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit_emi);
+#else
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit);
+#endif
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace)
+ return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace)
+ return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+ assert(DeviceMapPtr != nullptr && "DeviceMapPtr must be valid");
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_data_op_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+ ompt_data_t *target_data, ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+ void *dest_addr, int dest_device_num, size_t bytes,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ *host_op_id = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+ "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+ "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ endpoint, optype, target_task_data, target_task_data_value,
+ target_data, target_data->value, host_op_id, *host_op_id, src_addr,
+ src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ target_data->value = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+ "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+ kind, endpoint, device_num, task_data, task_data->value,
+ target_task_data, target_task_data_value, target_data,
+ target_data->value, codeptr_ra);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/veccopy_ompt_selective_tracing_kernel.cpp b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/veccopy_ompt_selective_tracing_kernel.cpp
new file mode 100644
index 0000000000000..285f1f1011268
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-selective-tracing-kernel/veccopy_ompt_selective_tracing_kernel.cpp
@@ -0,0 +1,70 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// XFAIL: *
+// clang-format on
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+#pragma omp target teams distribute parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+ for (auto Device : *DeviceMapPtr)
+ flush_trace(Device);
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK: rec={{0x[0-9a-fA-F]+}} type=10
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=9
+/// CHECK: rec={{0x[0-9a-fA-F]+}} type=10
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=8
+/// CHECK-NOT: rec={{0x[0-9a-fA-F]+}} type=9
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/callbacks.h
new file mode 100644
index 0000000000000..d05f05a4a4501
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/callbacks.h
@@ -0,0 +1,332 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+#define EMI 1
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) { // frees a trace buffer
+  printf("Deallocated %p\n", buffer); // log while the pointer is still valid
+  free(buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+                                            ompt_buffer_t **buffer,
+                                            size_t *bytes) {
+  *buffer = malloc(OMPT_BUFFER_REQUEST_SIZE); // fixed-size heap buffer
+  *bytes = OMPT_BUFFER_REQUEST_SIZE;          // size reported back to caller
+  printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes,
+         *buffer);
+}
+
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: %d %p %lu %p %d\n", device_num,
+         buffer, bytes, (void *)begin, buffer_owned);
+
+  // Print each record, advancing the cursor until the advance call returns 0.
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) {
+    print_record_ompt(ompt_get_record_ompt(buffer, current));
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned) // non-zero buffer_owned: free the buffer here
+    delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+  if (!ompt_set_trace_ompt)
+    return ompt_set_error;
+#if EMI // EMI selects the *_emi record kinds at compile time
+  const unsigned int Kinds[] = {ompt_callback_target_emi,
+                                ompt_callback_target_data_op_emi,
+                                ompt_callback_target_submit_emi};
+#else
+  const unsigned int Kinds[] = {ompt_callback_target,
+                                ompt_callback_target_data_op,
+                                ompt_callback_target_submit};
+#endif
+  for (unsigned int Kind : Kinds)
+    ompt_set_trace_ompt(Device, 1, Kind); // enable tracing for this kind
+  return ompt_set_always; // per-kind results are not inspected
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+  if (!ompt_start_trace)
+    return 0;
+
+  // Record the device in the traced-device set; it must not be there yet.
+  assert(DeviceMapPtr->count(Device) == 0 &&
+         "Device already present in the map");
+  DeviceMapPtr->insert(Device);
+
+  return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+                          &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+  // Forward to the flush entry point when it has been resolved; otherwise
+  // there is nothing to call and 0 is returned.
+  return ompt_flush_trace ? ompt_flush_trace(Device) : 0;
+}
+
+static int stop_trace(ompt_device_t *Device) {
+  // Forward to the stop entry point when it has been resolved; otherwise
+  // there is nothing to call and 0 is returned.
+  return ompt_stop_trace ? ompt_stop_trace(Device) : 0;
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target_data_op_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+ ompt_data_t *target_data, ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+ void *dest_addr, int dest_device_num, size_t bytes,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ *host_op_id = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+ "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+ "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ endpoint, optype, target_task_data, target_task_data_value,
+ target_data, target_data->value, host_op_id, *host_op_id, src_addr,
+ src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+}
+
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+                                        ompt_scope_endpoint_t endpoint,
+                                        int device_num, ompt_data_t *task_data,
+                                        ompt_data_t *target_task_data,
+                                        ompt_data_t *target_data,
+                                        const void *codeptr_ra) {
+  // A fresh id is stored in target_data once, when the region begins.
+  if (endpoint == ompt_scope_begin)
+    target_data->value = next_op_id++;
+  // target_task_data can be null, so its value is read only when present.
+  const uint64_t TaskValue = target_task_data ? target_task_data->value : 0;
+  printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+         "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+         kind, endpoint, device_num, task_data, task_data->value,
+         target_task_data, TaskValue, target_data, target_data->value,
+         codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/veccopy-ompt-target-data-tracing-emi.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/veccopy-ompt-target-data-tracing-emi.cpp
new file mode 100644
index 0000000000000..35a38f4eba739
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-data-tracing-emi/veccopy-ompt-target-data-tracing-emi.cpp
@@ -0,0 +1,146 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+#define N 100000
+
+#pragma omp declare target
+int c[N];
+#pragma omp end declare target
+
+int main() {
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+ for (i = 0; i < N; i++)
+ c[i] = 0;
+
+#pragma omp target enter data map(to : a)
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+#pragma omp target exit data map(from : a)
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+#pragma omp target parallel for map(alloc : c)
+ {
+ for (int j = 0; j < N; j++)
+ c[j] = 2 * j + 1;
+ }
+#pragma omp target update from(c) nowait
+#pragma omp barrier
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ int rc = 0;
+ for (i = 0; i < N; i++) {
+ if (a[i] != i) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+ }
+
+ for (i = 0; i < N; i++) {
+ if (c[i] != 2 * i + 1) {
+ rc++;
+ printf("Wrong value: c[%d]=%d\n", i, c[i]);
+ }
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+// > OMPT device callback related checks below. <
+
+/// CHECK-NOT: Callback Target EMI:
+/// CHECK-NOT: device_num=-1
+
+/// CHECK: Callback Init:
+/// CHECK: Callback Load:
+
+/// CHECK-DAG: Callback Target EMI: kind=2 endpoint=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK-DAG: Callback Target EMI: kind=2 endpoint=2
+
+/// CHECK-DAG: Callback Target EMI: kind=1 endpoint=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=2
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=2
+/// CHECK-DAG: Callback Submit EMI: endpoint=1 req_num_teams=1
+/// CHECK-DAG: Callback Submit EMI: endpoint=2 req_num_teams=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK-DAG: Callback Target EMI: kind=1 endpoint=2
+/// CHECK-DAG: Callback Target EMI: kind=3 endpoint=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=4
+/// CHECK-DAG: Callback Target EMI: kind=3 endpoint=2
+
+/// CHECK-DAG: Callback Target EMI: kind=1 endpoint=1
+/// CHECK-DAG: Callback Submit EMI: endpoint=1 req_num_teams=1
+/// CHECK-DAG: Callback Submit EMI: endpoint=2 req_num_teams=1
+/// CHECK-DAG: Callback Target EMI: kind=1 endpoint=2
+/// CHECK-DAG: Callback Target EMI: kind=4 endpoint=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=3
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=3
+/// CHECK-DAG: Callback Target EMI: kind=4 endpoint=2
+
+/// CHECK-DAG: Success
+/// CHECK-DAG: Callback Fini:
+
+// > OMPT device tracing related checks below. <
+
+
+
+// Note: Split checks for record address and content. That way we do not imply
+// any order. Records 01-06 and 12-17 occur interleaved and belong to the
+// first target region. 07-11 occur interleaved with 18-22 and belong to
+// the second target region.
+
+// Note: These addresses will only occur once. They are only captured to
+// indicate their existence.
+
+/// CHECK-DAG: type=8 (Target task)
+/// CHECK-DAG: type=9 (Target data op)
+
+// Note: ADDRX_11 may not trigger a final callback.
+
+// Note: ADDRX_11 may not be deallocated.
+
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/callbacks.h
new file mode 100644
index 0000000000000..f5724fa7a6852
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/callbacks.h
@@ -0,0 +1,333 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// Enable / disable OMPT's 'External Monitoring Interface'
+#define EMI 1
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) { // frees a trace buffer
+  printf("Deallocated %p\n", buffer); // log while the pointer is still valid
+  free(buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes,
+ *buffer);
+}
+
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: %d %p %lu %p %d\n", device_num,
+         buffer, bytes, (void *)begin, buffer_owned);
+
+  // Print each record, advancing the cursor until the advance call returns 0.
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) {
+    print_record_ompt(ompt_get_record_ompt(buffer, current));
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned) // non-zero buffer_owned: free the buffer here
+    delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+#if EMI
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit_emi);
+#else
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit);
+#endif
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace)
+ return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace)
+ return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target_data_op_emi(
+    ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+    ompt_data_t *target_data, ompt_id_t *host_op_id,
+    ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+    void *dest_addr, int dest_device_num, size_t bytes,
+    const void *codeptr_ra) {
+  // A fresh operation id is handed out once, when the operation begins.
+  if (endpoint == ompt_scope_begin)
+    *host_op_id = next_op_id++;
+  // target_task_data can be null, so its value is read only when present.
+  const uint64_t TaskValue = target_task_data ? target_task_data->value : 0;
+  printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+         "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+         "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+         endpoint, optype, target_task_data, TaskValue, target_data,
+         target_data->value, host_op_id, *host_op_id, src_addr,
+         src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+}
+
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ target_data->value = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+ "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+ kind, endpoint, device_num, task_data, task_data->value,
+ target_task_data, target_task_data_value, target_data,
+ target_data->value, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Tool entry point: returns the initialize/finalize pair of this tool.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  static ompt_start_tool_result_t Res = {&ompt_initialize, &ompt_finalize, 0};
+  return &Res;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/veccopy-ompt-target-emi-tracing-dag.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/veccopy-ompt-target-emi-tracing-dag.cpp
new file mode 100644
index 0000000000000..0fdae19017ab2
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-emi-tracing-dag/veccopy-ompt-target-emi-tracing-dag.cpp
@@ -0,0 +1,81 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+  int N = 100000;
+
+  int a[N];
+  int b[N];
+
+  // Host setup: zero the destination, fill the source with 0..N-1.
+  for (int i = 0; i < N; i++) {
+    a[i] = 0;
+    b[i] = i;
+  }
+
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+
+  // Flush the trace of every device the tool recorded in DeviceMapPtr.
+  for (auto Dev : *DeviceMapPtr)
+    flush_trace(Dev);
+
+#pragma omp target teams distribute parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+
+  // Verify on the host; the exit code is the number of mismatches.
+  int rc = 0;
+  for (int i = 0; i < N; i++) {
+    if (a[i] == b[i])
+      continue;
+    rc++;
+    printf("Wrong value: a[%d]=%d\n", i, a[i]);
+  }
+
+  if (!rc)
+    printf("Success\n");
+  return rc;
+}
+
+// clang-format off
+
+/// CHECK-NOT: host_op_id=0x0
+
+
+
+// Note: Split checks for record address and content. That way we do not imply
+// any order. Records 01-06 and 12-17 occur interleaved and belong to the
+// first target region. 07-11 occur interleaved with 18-22 and belong to
+// the second target region.
+
+/// CHECK-DAG: type=8 (Target task)
+/// CHECK-DAG: type=9 (Target data op)
+
+// Note: These addresses will only occur once. They are only captured to
+// indicate their existence.
+
+
+// Note: ADDRX_11 may not trigger a final callback.
+
+// Note: ADDRX_11 may not be deallocated.
+
+/// CHECK-DAG: Success
+
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/callbacks.h
new file mode 100644
index 0000000000000..d05f05a4a4501
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/callbacks.h
@@ -0,0 +1,332 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+#define EMI 1
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+ free(buffer);
+ printf("Deallocated %p\n", buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes,
+ *buffer);
+}
+
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: %d %p %zu %p %d\n", device_num,
+         buffer, bytes, (void *)begin, buffer_owned);
+  // Walk the returned records starting at 'begin' and print each one.
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) { // ends once the cursor advance returns 0
+    ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+    print_record_ompt(rec);
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned) // release the buffer only when buffer_owned is set
+    delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+#if EMI
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit_emi);
+#else
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit);
+#endif
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace)
+ return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace)
+ return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this is a good place to start the trace. If
+ // start_trace is called from the main program before this callback is
+ // dispatched, the ompt_start_trace handle is still null, because this
+ // device-initialize callback is only invoked while the first target
+ // construct is being executed.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target_data_op_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+ ompt_data_t *target_data, ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+ void *dest_addr, int dest_device_num, size_t bytes,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ *host_op_id = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+ "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+ "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ endpoint, optype, target_task_data, target_task_data_value,
+ target_data, target_data->value, host_op_id, *host_op_id, src_addr,
+ src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+}
+
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ target_data->value = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+ "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+ kind, endpoint, device_num, task_data, task_data->value,
+ target_task_data, target_task_data_value, target_data,
+ target_data->value, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/veccopy-ompt-target-emi-tracing.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/veccopy-ompt-target-emi-tracing.cpp
new file mode 100644
index 0000000000000..e682e9b20bfd8
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-api-use/veccopy-ompt-target-emi-tracing.cpp
@@ -0,0 +1,78 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN=false LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL=false %libomptarget-run-generic | %fcheck-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+/*
+ * This test is run with LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN=false and
+ * LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL=false while doing explicit flushing
+ * using ompt_flush_trace. The intention is to check whether trace records are
+ * properly flushed when the program/tool uses the ompt_flush_trace API. There
+ * should be 23 trace records returned to the tool.
+ */
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+#pragma omp target teams distribute parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+/// CHECK-NOT: host_op_id=0x0
+
+/// CHECK: rec=
+
+/// CHECK-DAG: Success
+
+/// The user calls flush before printing success, so
+/// no more records should be returned here.
+
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/callbacks.h
new file mode 100644
index 0000000000000..d05f05a4a4501
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/callbacks.h
@@ -0,0 +1,332 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+#define EMI 1
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+ free(buffer);
+ printf("Deallocated %p\n", buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes,
+ *buffer);
+}
+
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: %d %p %zu %p %d\n", device_num,
+         buffer, bytes, (void *)begin, buffer_owned);
+  // Walk the returned records starting at 'begin' and print each one.
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) { // ends once the cursor advance returns 0
+    ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+    print_record_ompt(rec);
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned) // release the buffer only when buffer_owned is set
+    delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+#if EMI
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit_emi);
+#else
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit);
+#endif
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace)
+ return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace)
+ return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this is a good place to start the trace. If
+ // start_trace is called from the main program before this callback is
+ // dispatched, the ompt_start_trace handle is still null, because this
+ // device-initialize callback is only invoked while the first target
+ // construct is being executed.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target_data_op_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+ ompt_data_t *target_data, ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+ void *dest_addr, int dest_device_num, size_t bytes,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ *host_op_id = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+ "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+ "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ endpoint, optype, target_task_data, target_task_data_value,
+ target_data, target_data->value, host_op_id, *host_op_id, src_addr,
+ src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+}
+
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra) {
+ if (endpoint == ompt_scope_begin)
+ target_data->value = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value =
+ (target_task_data) ? target_task_data->value : 0;
+ printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+ "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+ kind, endpoint, device_num, task_data, task_data->value,
+ target_task_data, target_task_data_value, target_data,
+ target_data->value, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/veccopy-ompt-target-emi-tracing.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/veccopy-ompt-target-emi-tracing.cpp
new file mode 100644
index 0000000000000..abf2426d2eee9
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-buffer-full/veccopy-ompt-target-emi-tracing.cpp
@@ -0,0 +1,67 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN=false %libomptarget-run-generic | %fcheck-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+/*
+ * This test is run with LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN=false and
+ * ompt_flush_trace is not invoked by the user/tool. The intention is to check
+ * whether trace records are properly flushed as a buffer fills up. Currently,
+ * the buffer size in callbacks.h implies that every buffer holds 2 trace
+ * records. With this assumption, there should be 22 trace records. The last
+ * trace record is not flushed because the last buffer is not yet full.
+ */
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+#pragma omp target teams distribute parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+/// CHECK-NOT: host_op_id=0x0
+
+/// CHECK: rec=
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/callbacks.h
new file mode 100644
index 0000000000000..d05f05a4a4501
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/callbacks.h
@@ -0,0 +1,332 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+#define EMI 1
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+// Registers the tool function on_<name> as handler for callback <name> via
+// the ompt_set_callback entry point and prints a diagnostic when the result
+// is ompt_set_never. `type` is the function-pointer type of the handler.
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+// Convenience form: the handler type is derived from the name as <name>_t.
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+// Prints one trace record on a single line. The layout is chosen by the
+// record type: target, target data op, or target submit (kernel), each in its
+// plain or EMI flavour. A NULL record is ignored; any other record type trips
+// the assert.
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+  if (!rec)
+    return;
+
+  switch (rec->type) {
+  case ompt_callback_target:
+  case ompt_callback_target_emi: {
+    const ompt_record_target_t &Target = rec->record.target;
+    printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+           "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+           "codeptr=%p\n",
+           rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+           Target.kind, Target.endpoint, Target.device_num, Target.task_id,
+           Target.codeptr_ra);
+    break;
+  }
+  case ompt_callback_target_data_op:
+  case ompt_callback_target_data_op_emi: {
+    const ompt_record_target_data_op_t &DataOp = rec->record.target_data_op;
+    // The duration is derived from the record's start time and end_time.
+    printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+           "target_id=0x%lx host_op_id=0x%lx optype=%d "
+           "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+           "end_time=%lu duration=%lu ns codeptr=%p\n",
+           rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+           DataOp.host_op_id, DataOp.optype, DataOp.src_addr,
+           DataOp.src_device_num, DataOp.dest_addr, DataOp.dest_device_num,
+           DataOp.bytes, DataOp.end_time, DataOp.end_time - rec->time,
+           DataOp.codeptr_ra);
+    break;
+  }
+  case ompt_callback_target_submit:
+  case ompt_callback_target_submit_emi: {
+    const ompt_record_target_kernel_t &Kernel = rec->record.target_kernel;
+    printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+           "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+           "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+           rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+           Kernel.host_op_id, Kernel.requested_num_teams,
+           Kernel.granted_num_teams, Kernel.end_time,
+           Kernel.end_time - rec->time);
+    break;
+  }
+  default:
+    assert(0);
+    break;
+  }
+}
+
+// Frees a trace buffer and logs its address.
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+  // Log before freeing: once free() returns, the pointer value is
+  // indeterminate and must not be read, not even to print it.
+  printf("Deallocated %p\n", buffer);
+  free(buffer);
+}
+
+// OMPT callbacks
+
+// Trace record callbacks
+// Buffer-request callback: returns a malloc'ed buffer of
+// OMPT_BUFFER_REQUEST_SIZE bytes through `buffer` and its size through
+// `bytes`. device_num is not used.
+static void on_ompt_callback_buffer_request(int device_num,
+                                            ompt_buffer_t **buffer,
+                                            size_t *bytes) {
+  *buffer = malloc(OMPT_BUFFER_REQUEST_SIZE);
+  // Never advertise capacity that is not backed by memory: report zero bytes
+  // when the allocation failed.
+  *bytes = *buffer ? OMPT_BUFFER_REQUEST_SIZE : 0;
+  // %zu is the conversion for size_t; the text printed on success is the same.
+  printf("Allocated %zu bytes at %p in buffer request callback\n", *bytes,
+         *buffer);
+}
+
+// Buffer-complete callback: prints every record found in `buffer`, starting
+// at cursor `begin`, and frees the buffer when buffer_owned is non-zero.
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: %d %p %lu %p %d\n", device_num,
+         buffer, bytes, (void *)begin, buffer_owned);
+
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) {
+    ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+    print_record_ompt(rec);
+    // The last argument receives the next cursor position, so it has to be
+    // the address of `current`; a zero status ends the walk.
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned)
+    delete_buffer_ompt(buffer);
+}
+
+// Enables tracing of the target, target data op and target submit events on
+// Device. Returns ompt_set_error when the ompt_set_trace_ompt entry point is
+// missing and ompt_set_always otherwise; the results of the individual calls
+// are ignored.
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+  if (ompt_set_trace_ompt == NULL)
+    return ompt_set_error;
+
+  // Event types to trace; EMI selects the *_emi flavour of each.
+  const unsigned int TracedEvents[] = {
+#if EMI
+      ompt_callback_target_emi,
+      ompt_callback_target_data_op_emi,
+      ompt_callback_target_submit_emi,
+#else
+      ompt_callback_target,
+      ompt_callback_target_data_op,
+      ompt_callback_target_submit,
+#endif
+  };
+  for (unsigned int Event : TracedEvents)
+    ompt_set_trace_ompt(Device, 1, Event);
+
+  return ompt_set_always;
+}
+
+// Records Device in the DeviceMap and starts tracing on it with this tool's
+// buffer-request and buffer-complete callbacks. Returns 0 when the
+// ompt_start_trace entry point is missing, otherwise its result.
+static int start_trace(int device_num, ompt_device_t *Device) {
+  if (ompt_start_trace == NULL)
+    return 0;
+
+  // A device is expected to be registered for tracing only once.
+  assert(DeviceMapPtr->count(Device) == 0 &&
+         "Device already present in the map");
+  DeviceMapPtr->insert(Device);
+
+  return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+                          &on_ompt_callback_buffer_complete);
+}
+
+// Flushes the trace of Device; yields 0 when the ompt_flush_trace entry
+// point has not been resolved.
+static int flush_trace(ompt_device_t *Device) {
+  return ompt_flush_trace ? ompt_flush_trace(Device) : 0;
+}
+
+// Stops tracing on Device; yields 0 when the ompt_stop_trace entry point has
+// not been resolved.
+static int stop_trace(ompt_device_t *Device) {
+  return ompt_stop_trace ? ompt_stop_trace(Device) : 0;
+}
+
+// Synchronous callbacks
+// Device-initialize callback. Logs its arguments, resolves the device tracing
+// entry points through `lookup` into the file-scope handles, creates the
+// DeviceMap on first use, then enables the traced event types and starts the
+// trace for `device`. Without a lookup function, tracing is reported as
+// disabled and nothing else happens.
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ // A failed lookup leaves the corresponding handle null.
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+// Device-finalize callback: only logs the device number.
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+// Device-load callback: logs the device number, file name, host and device
+// addresses and the byte count. offset_in_file, vma_in_file and module_id are
+// not printed.
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Callback Load: device_num:%d filename:%s host_adddr:%p device_addr:%p"
+ " bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+// Non-EMI target-data-op callback: logs the operation. target_id is not
+// printed. ompt_initialize registers this handler only when EMI is 0.
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+// EMI target-data-op callback: assigns the next tool-generated id to
+// *host_op_id at the begin endpoint and logs the operation.
+static void on_ompt_callback_target_data_op_emi(
+    ompt_scope_endpoint_t endpoint, ompt_data_t *target_task_data,
+    ompt_data_t *target_data, ompt_id_t *host_op_id,
+    ompt_target_data_op_t optype, void *src_addr, int src_device_num,
+    void *dest_addr, int dest_device_num, size_t bytes,
+    const void *codeptr_ra) {
+  // A fresh id is handed out only when the operation begins.
+  if (endpoint == ompt_scope_begin) {
+    *host_op_id = next_op_id;
+    ++next_op_id;
+  }
+
+  // target_task_data may be null; print 0 instead of dereferencing it.
+  uint64_t TargetTaskValue = 0;
+  if (target_task_data)
+    TargetTaskValue = target_task_data->value;
+
+  printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
+         "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
+         "src_device_num=%d dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+         endpoint, optype, target_task_data, TargetTaskValue, target_data,
+         target_data->value, host_op_id, *host_op_id, src_addr, src_device_num,
+         dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+// Non-EMI target callback: logs kind, endpoint, device number, target id and
+// code pointer. task_data is not used. ompt_initialize registers this handler
+// only when EMI is 0.
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu "
+ "code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+}
+
+// EMI target callback: stores the next tool-generated id in
+// target_data->value at the begin endpoint and logs the region.
+static void on_ompt_callback_target_emi(ompt_target_t kind,
+                                        ompt_scope_endpoint_t endpoint,
+                                        int device_num, ompt_data_t *task_data,
+                                        ompt_data_t *target_task_data,
+                                        ompt_data_t *target_data,
+                                        const void *codeptr_ra) {
+  // A fresh id is handed out only when the region begins.
+  if (endpoint == ompt_scope_begin) {
+    target_data->value = next_op_id;
+    ++next_op_id;
+  }
+
+  // target_task_data may be null; print 0 instead of dereferencing it.
+  uint64_t TargetTaskValue = 0;
+  if (target_task_data)
+    TargetTaskValue = target_task_data->value;
+
+  printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p "
+         "(0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+         kind, endpoint, device_num, task_data, task_data->value,
+         target_task_data, TargetTaskValue, target_data, target_data->value,
+         codeptr_ra);
+}
+
+// Non-EMI target-submit callback: logs target id, host operation id and the
+// requested number of teams. ompt_initialize registers this handler only when
+// EMI is 0.
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+// EMI target-submit callback: logs the endpoint, the requested number of
+// teams, and both the addresses and the current values of target_data and
+// host_op_id. Both pointers are dereferenced without a null check.
+static void on_ompt_callback_target_submit_emi(
+ ompt_scope_endpoint_t endpoint, ompt_data_t *target_data,
+ ompt_id_t *host_op_id, unsigned int requested_num_teams) {
+ printf(" Callback Submit EMI: endpoint=%d req_num_teams=%d target_data=%p "
+ "(0x%lx) host_op_id=%p (0x%lx)\n",
+ endpoint, requested_num_teams, target_data, target_data->value,
+ host_op_id, *host_op_id);
+}
+
+// Init functions
+// Tool initializer. Resolves ompt_set_callback through `lookup` and registers
+// the device callbacks plus either the EMI or the non-EMI target callbacks,
+// depending on EMI. Returns 0 when ompt_set_callback cannot be resolved and 1
+// otherwise. initial_device_num and tool_data are not used.
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+#if EMI
+ register_ompt_callback(ompt_callback_target_data_op_emi);
+ register_ompt_callback(ompt_callback_target_emi);
+ register_ompt_callback(ompt_callback_target_submit_emi);
+#else
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+#endif
+
+ return 1; // success
+}
+
+// Tool finalizer: intentionally does nothing.
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Tool entry point, given C linkage when compiled as C++. Returns the address
+// of a function-local static result that holds ompt_initialize, ompt_finalize
+// and 0 for the remaining field. Both arguments are ignored.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/veccopy-ompt-target-emi-tracing.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/veccopy-ompt-target-emi-tracing.cpp
new file mode 100644
index 0000000000000..7f4e99b83e4e3
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing-flush-only-on-shutdown/veccopy-ompt-target-emi-tracing.cpp
@@ -0,0 +1,66 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL=false %libomptarget-run-generic | %fcheck-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+/*
+ * This test is run with LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL=false and
+ * ompt_flush_trace is not invoked by the user/tool. The intention is to check
+ * whether trace records are properly flushed on shutdown. 23 trace records
+ * should be flushed during shutdown.
+ */
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+// Copies b into a inside two different target constructs and then verifies
+// the copy on the host. The exit code is the number of mismatching elements.
+int main() {
+  int N = 100000;
+
+  int a[N];
+  int b[N];
+
+  // Host-side setup: a starts zeroed, b holds its own indices.
+  for (int idx = 0; idx < N; ++idx)
+    a[idx] = 0;
+  for (int idx = 0; idx < N; ++idx)
+    b[idx] = idx;
+
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+
+#pragma omp target teams distribute parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+
+  // Report every element that was not copied correctly.
+  int mismatches = 0;
+  for (int idx = 0; idx < N; ++idx) {
+    if (a[idx] == b[idx])
+      continue;
+    ++mismatches;
+    printf("Wrong value: a[%d]=%d\n", idx, a[idx]);
+  }
+
+  if (mismatches == 0)
+    printf("Success\n");
+
+  return mismatches;
+}
+
+// clang-format off
+
+/// CHECK-NOT: host_op_id=0x0
+
+/// CHECK: rec=
+
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-tracing/callbacks.h
new file mode 100644
index 0000000000000..cfb5fd4147ebf
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing/callbacks.h
@@ -0,0 +1,292 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+//
+// SPDX-License-Identifier: MIT
+
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// Enable / disable OMPT's 'External Monitoring Interface'
+#define EMI 0
+
+// From openmp/runtime/test/ompt/callback.h
+// Registers the tool function on_<name> as handler for callback <name> via
+// the ompt_set_callback entry point and prints a diagnostic when the result
+// is ompt_set_never. `type` is the function-pointer type of the handler.
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+// Convenience form: the handler type is derived from the name as <name>_t.
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE (4 * 1024)
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+// Prints one trace record on a single line. The layout is chosen by the
+// record type: target, target data op, or target submit (kernel), each in its
+// plain or EMI flavour. A NULL record is ignored; any other record type trips
+// the assert.
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ // duration is computed as end_time minus the record's start time.
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ // duration is computed as end_time minus the record's start time.
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ // Only the six record types above are expected here.
+ assert(0);
+ break;
+ }
+}
+
+// Frees a trace buffer and logs its address.
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+  // Log before freeing: once free() returns, the pointer value is
+  // indeterminate and must not be read, not even to print it.
+  printf("Deallocated %p\n", buffer);
+  free(buffer);
+}
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+static ompt_get_record_type_t ompt_get_record_type_fn = 0;
+// OMPT callbacks
+
+// Trace record callbacks
+// Buffer-request callback: returns a malloc'ed buffer of
+// OMPT_BUFFER_REQUEST_SIZE bytes through `buffer` and its size through
+// `bytes`. device_num is not used.
+static void on_ompt_callback_buffer_request(int device_num,
+                                            ompt_buffer_t **buffer,
+                                            size_t *bytes) {
+  *buffer = malloc(OMPT_BUFFER_REQUEST_SIZE);
+  // Never advertise capacity that is not backed by memory: report zero bytes
+  // when the allocation failed.
+  *bytes = *buffer ? OMPT_BUFFER_REQUEST_SIZE : 0;
+  // %zu is the conversion for size_t; the text printed on success is the same.
+  printf("Allocated %zu bytes at %p in buffer request callback\n", *bytes,
+         *buffer);
+}
+
+// Note: This callback must handle a null begin cursor. Currently,
+// ompt_get_record_ompt, print_record_ompt, and
+// ompt_advance_buffer_cursor handle a null cursor.
+// Buffer-complete callback: prints every record found in `buffer`, starting
+// at cursor `begin`, warns about records whose type is not ompt_record_ompt,
+// and frees the buffer when buffer_owned is non-zero.
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: %d %p %lu %p %d\n", device_num,
+         buffer, bytes, (void *)begin, buffer_owned);
+
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) {
+    ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+
+    // The record-type entry point is optional: the device-initialize callback
+    // only warns when its lookup fails, so the handle may be null here.
+    if (ompt_get_record_type_fn &&
+        ompt_get_record_type_fn(buffer, current) != ompt_record_ompt) {
+      printf("WARNING: received non-ompt type buffer object\n");
+    }
+    print_record_ompt(rec);
+    // The last argument receives the next cursor position, so it has to be
+    // the address of `current`; a zero status ends the walk.
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned)
+    delete_buffer_ompt(buffer);
+}
+
+// Enables tracing of the target, target data op and target submit events on
+// Device. Returns ompt_set_error when the ompt_set_trace_ompt entry point is
+// missing and ompt_set_always otherwise; the results of the individual calls
+// are ignored.
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+  if (ompt_set_trace_ompt == NULL)
+    return ompt_set_error;
+
+  const unsigned int TracedEvents[] = {ompt_callback_target,
+                                       ompt_callback_target_data_op,
+                                       ompt_callback_target_submit};
+  for (unsigned int Event : TracedEvents)
+    ompt_set_trace_ompt(Device, 1, Event);
+
+  return ompt_set_always;
+}
+
+// Records Device in the DeviceMap and starts tracing on it with this tool's
+// buffer-request and buffer-complete callbacks. Returns 0 when the
+// ompt_start_trace entry point is missing, otherwise its result.
+static int start_trace(int device_num, ompt_device_t *Device) {
+  if (ompt_start_trace == NULL)
+    return 0;
+
+  // A device is expected to be registered for tracing only once.
+  assert(DeviceMapPtr->count(Device) == 0 &&
+         "Device already present in the map");
+  DeviceMapPtr->insert(Device);
+
+  return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+                          &on_ompt_callback_buffer_complete);
+}
+
+// Flushes the trace of Device; yields 0 when the ompt_flush_trace entry
+// point has not been resolved.
+static int flush_trace(ompt_device_t *Device) {
+  return ompt_flush_trace ? ompt_flush_trace(Device) : 0;
+}
+
+// Stops tracing on Device; yields 0 when the ompt_stop_trace entry point has
+// not been resolved.
+static int stop_trace(ompt_device_t *Device) {
+  return ompt_stop_trace ? ompt_stop_trace(Device) : 0;
+}
+
+// Synchronous callbacks
+// Device-initialize callback. Logs its arguments, resolves the device tracing
+// entry points through `lookup` into the file-scope handles, creates the
+// DeviceMap on first use, then enables the traced event types and starts the
+// trace for `device`. Without a lookup function, tracing is reported as
+// disabled and nothing else happens.
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n", device_num,
+ type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ // A failed lookup leaves the corresponding handle null.
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ // A missing ompt_get_record_type is only reported; initialization goes on
+ // with ompt_get_record_type_fn left null.
+ ompt_get_record_type_fn =
+ (ompt_get_record_type_t)lookup("ompt_get_record_type");
+ if (!ompt_get_record_type_fn) {
+ printf("WARNING: No function ompt_get_record_type found in device "
+ "callbacks\n");
+ }
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+// Device-finalize callback: only logs the device number.
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+// Device-load callback: logs the device number, file name, host and device
+// addresses and the byte count. offset_in_file, vma_in_file and module_id are
+// not printed.
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Load: device_num:%d filename:%s host_adddr:%p device_addr:%p "
+ "bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+// Target-data-op callback: logs the target id, host operation id, operation
+// type, source and destination address/device, byte count and code pointer.
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p "
+ "src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ target_id, host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+// Target callback: logs target id, kind, endpoint, device number and code
+// pointer. task_data is not used.
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d "
+ "code=%p\n",
+ target_id, kind, endpoint, device_num, codeptr_ra);
+}
+
+// Target-submit callback: logs target id, host operation id and the requested
+// number of teams.
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+// Init functions
+// Tool initializer. Resolves ompt_set_callback through `lookup` and registers
+// the device callbacks and the non-EMI target callbacks. Returns 0 when
+// ompt_set_callback cannot be resolved and 1 otherwise. initial_device_num
+// and tool_data are not used.
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+
+ return 1; // success
+}
+
+// Tool finalizer: intentionally does nothing.
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Tool entry point, given C linkage when compiled as C++. Returns the address
+// of a function-local static result that holds ompt_initialize, ompt_finalize
+// and 0 for the remaining field. Both arguments are ignored.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-tracing/veccopy-ompt-target-tracing.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-tracing/veccopy-ompt-target-tracing.cpp
new file mode 100644
index 0000000000000..d393bc63b2e14
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-tracing/veccopy-ompt-target-tracing.cpp
@@ -0,0 +1,78 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+//
+// SPDX-License-Identifier: MIT
+
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+// Copies b into a inside two different target constructs and then verifies
+// the copy on the host. The exit code is the number of mismatching elements.
+int main() {
+  int N = 100000;
+
+  int a[N];
+  int b[N];
+
+  // Host-side setup: a starts zeroed, b holds its own indices.
+  for (int idx = 0; idx < N; ++idx)
+    a[idx] = 0;
+  for (int idx = 0; idx < N; ++idx)
+    b[idx] = idx;
+
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+
+#pragma omp target teams distribute parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j];
+  }
+
+  // Report every element that was not copied correctly.
+  int mismatches = 0;
+  for (int idx = 0; idx < N; ++idx) {
+    if (a[idx] == b[idx])
+      continue;
+    ++mismatches;
+    printf("Wrong value: a[%d]=%d\n", idx, a[idx]);
+  }
+
+  if (mismatches == 0)
+    printf("Success\n");
+
+  return mismatches;
+}
+
+// clang-format off
+
+// > OMPT device tracing related checks below. <
+
+// Note: This test will allocate one buffer, big enough to hold all trace
+// records, hence there will be only one allocation.
+
+
+// Note: Split checks for record address and content. That way we do not imply
+// any order. Records may / will occur interleaved.
+
+// Note: These addresses will only occur once. They are only captured to
+// indicate their existence.
+
+/// CHECK-DAG: type=8 (Target task)
+/// CHECK-DAG: type=9 (Target data op)
+
+// Note: ADDRX_01 may not trigger a final callback.
+// Note: ADDRX_01 may not be deallocated.
+
+/// CHECK-NOT: host_op_id=0x0
>From fd5064b5d52daccdf558aac5d5e5d1936a816c54 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 21 May 2026 08:44:24 -0500
Subject: [PATCH 11/15] [Openmp] Add OMPT device tracing support tests
These tests concern functionality used by parts of the OMPT device
tracing implementation.
---
.../veccopy-ompt-ctor-1/Inputs/callbacks.h | 373 ++++++++++++++++++
.../Inputs/veccopy-ctor-1.cpp | 37 ++
.../tracing/veccopy-ompt-ctor-1/veccopy.cpp | 52 +++
.../callbacks.h | 286 ++++++++++++++
.../veccopy-ompt-target-default-device.cpp | 86 ++++
.../veccopy-ompt-target-devices/callbacks.h | 324 +++++++++++++++
.../veccopy-ompt-target-devices.cpp | 86 ++++
.../callbacks.h | 166 ++++++++
.../veccopy-ompt-target.c | 78 ++++
.../callbacks.h | 160 ++++++++
.../veccopy-ompt-target.c | 78 ++++
11 files changed, 1726 insertions(+)
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/veccopy-ctor-1.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-ctor-1/veccopy.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-default-device/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-default-device/veccopy-ompt-target-default-device.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-devices/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-devices/veccopy-ompt-target-devices.cpp
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-translate-time/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-translate-time/veccopy-ompt-target.c
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/callbacks.h
create mode 100644 offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/veccopy-ompt-target.c
diff --git a/offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/callbacks.h
new file mode 100644
index 0000000000000..d206c46567a3e
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/callbacks.h
@@ -0,0 +1,373 @@
+#include <assert.h>
+#include <memory>
+#include <unordered_set>
+
+#define EMI 1
+
+// Tool related code below
+#include <omp-tools.h>
+
+ompt_id_t next_op_id = 0x8000000000000001;
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t*> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL) return;
+
+ printf("rec=%p type=%2d time=%lu thread_id=%lu target_id=%lu\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id);
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi:
+ {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("\tRecord Target task: target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu codeptr=%p\n",
+ target_rec.target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi:
+ {
+ ompt_record_target_data_op_t target_data_op_rec = rec->record.target_data_op;
+ printf("\t Record Target data op: target_id=0x%lx host_op_id=0x%lx optype=%d src_addr=%p src_device=%d "
+ "dest_addr=%p dest_device=%d bytes=%lu end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi:
+ {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("\t Record Target kernel: target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u granted_num_teams=%u "
+ "end_time=%lu duration=%lu ns\n",
+ rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+ free(buffer);
+ printf("Deallocated %p\n", buffer);
+}
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request (
+ int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes
+) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback\n", *bytes, *buffer);
+}
+
+static void on_ompt_callback_buffer_complete (
+ int device_num,
+ ompt_buffer_t *buffer,
+ size_t bytes, /* bytes returned in this callback */
+ ompt_buffer_cursor_t begin,
+ int buffer_owned
+) {
+ printf("Executing buffer complete callback: %d %p %lu %p %d\n",
+ device_num, buffer, bytes, (void*)begin, buffer_owned);
+
+ int status = 1;
+ ompt_buffer_cursor_t current = begin;
+ while (status) {
+ ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+ print_record_ompt(rec);
+ status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+ buffer,
+ bytes,
+ current,
+ &current);
+ }
+ if (buffer_owned) delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt) return ompt_set_error;
+
+#if EMI
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op_emi);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit_emi);
+#else
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_data_op);
+ ompt_set_trace_ompt(Device, 1, ompt_callback_target_submit);
+#endif
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace) return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace) return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace) return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize
+(
+ int device_num,
+ const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation
+ ) {
+ printf("Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+ device_num, type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t) lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t) lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t) lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t) lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t) lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor = (ompt_advance_buffer_cursor_t) lookup("ompt_advance_buffer_cursor");
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ // In many scenarios, this will be a good place to start the
+ // trace. If start_trace is called from the main program before this
+ // callback is dispatched, the start_trace handle will be null. This
+ // is because this device_init callback is invoked during the first
+ // target construct implementation.
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_load
+ (
+ int device_num,
+ const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file,
+ size_t bytes,
+ void *host_addr,
+ void *device_addr,
+ uint64_t module_id
+ ) {
+ printf("Load: device_num:%d filename:%s host_adddr:%p device_addr:%p bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+
+static void on_ompt_callback_target_data_op
+ (
+ ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ ompt_target_data_op_t optype,
+ void *src_addr,
+ int src_device_num,
+ void *dest_addr,
+ int dest_device_num,
+ size_t bytes,
+ const void *codeptr_ra
+ ) {
+ printf(" Callback DataOp: host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ host_op_id, optype, src_addr, src_device_num,
+ dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+
+static void on_ompt_callback_target_data_op_emi
+ (
+ ompt_scope_endpoint_t endpoint,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
+ ompt_target_data_op_t optype,
+ void *src_addr,
+ int src_device_num,
+ void *dest_addr,
+ int dest_device_num,
+ size_t bytes,
+ const void *codeptr_ra
+ ) {
+ if (endpoint == ompt_scope_begin) *host_op_id = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value = (target_task_data) ? target_task_data->value : 0;
+ printf(" Callback DataOp EMI: endpoint=%d target_task_data=%p (0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ endpoint,
+ target_task_data, target_task_data_value,
+ target_data, target_data->value,
+ host_op_id, *host_op_id,
+ optype, src_addr, src_device_num,
+ dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+
+static void on_ompt_callback_target
+ (
+ ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num,
+ ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra
+ ) {
+ printf("Callback Target: kind=%d endpoint=%d device_num=%d target_id=%lu code=%p\n",
+ kind, endpoint, device_num, target_id, codeptr_ra);
+}
+
+
+static void on_ompt_callback_target_emi
+ (
+ ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num,
+ ompt_data_t *task_data,
+ ompt_data_t *target_task_data,
+ ompt_data_t *target_data,
+ const void *codeptr_ra
+ ) {
+ if (endpoint == ompt_scope_begin) target_data->value = next_op_id++;
+ // target_task_data may be null, avoid dereferencing it
+ uint64_t target_task_data_value = (target_task_data) ? target_task_data->value : 0;
+ printf("Callback Target EMI: kind=%d endpoint=%d device_num=%d task_data=%p (0x%lx) target_task_data=%p (0x%lx) target_data=%p (0x%lx) code=%p\n",
+ kind, endpoint, device_num,
+ task_data, task_data->value,
+ target_task_data, target_task_data_value,
+ target_data, target_data->value,
+ codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit
+ (
+ ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams
+ ) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+
+static void on_ompt_callback_target_submit_emi
+ (
+ ompt_scope_endpoint_t endpoint,
+ ompt_data_t *target_data,
+ ompt_id_t *host_op_id,
+ unsigned int requested_num_teams
+ ) {
+ printf(" Callback Submit EMI: endpoint=%d target_data=%p (0x%lx) host_op_id=%p (0x%lx) req_num_teams=%d\n",
+ endpoint,
+ target_data, target_data->value,
+ host_op_id, *host_op_id,
+ requested_num_teams);
+}
+
+// Init functions
+int ompt_initialize(
+ ompt_function_lookup_t lookup,
+ int initial_device_num,
+ ompt_data_t *tool_data)
+{
+ ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+
+ ompt_set_callback(ompt_callback_device_initialize,
+ (ompt_callback_t)&on_ompt_callback_device_initialize);
+ ompt_set_callback(ompt_callback_device_load,
+ (ompt_callback_t)&on_ompt_callback_device_load);
+#if EMI
+ ompt_set_callback(ompt_callback_target_submit_emi,
+ (ompt_callback_t)&on_ompt_callback_target_submit_emi);
+ ompt_set_callback(ompt_callback_target_data_op_emi,
+ (ompt_callback_t)&on_ompt_callback_target_data_op_emi);
+ ompt_set_callback(ompt_callback_target_emi,
+ (ompt_callback_t)&on_ompt_callback_target_emi);
+#else
+ ompt_set_callback(ompt_callback_target_submit,
+ (ompt_callback_t)&on_ompt_callback_target_submit);
+ ompt_set_callback(ompt_callback_target_data_op,
+ (ompt_callback_t)&on_ompt_callback_target_data_op);
+ ompt_set_callback(ompt_callback_target,
+ (ompt_callback_t)&on_ompt_callback_target);
+#endif
+ return 1; //success
+}
+
+void ompt_finalize(ompt_data_t *tool_data)
+{
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(
+ unsigned int omp_version,
+ const char *runtime_version)
+{
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
+
+
diff --git a/offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/veccopy-ctor-1.cpp b/offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/veccopy-ctor-1.cpp
new file mode 100644
index 0000000000000..c944d0b5831fa
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-ctor-1/Inputs/veccopy-ctor-1.cpp
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <omp.h>
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int status;
+
+__attribute__((constructor(101))) void veccopy_init() {
+ int N = 10;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i=0; i<N; i++)
+ a[i]=0;
+
+ for (i=0; i<N; i++)
+ b[i]=i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j< N; j++)
+ a[j]=b[j];
+ }
+
+ if (a[5] != b[5])
+ status = 1;
+ else
+ status = 0;
+}
+
+
diff --git a/offload/test/ompt/tracing/veccopy-ompt-ctor-1/veccopy.cpp b/offload/test/ompt/tracing/veccopy-ompt-ctor-1/veccopy.cpp
new file mode 100644
index 0000000000000..3d6533ad45f0e
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-ctor-1/veccopy.cpp
@@ -0,0 +1,52 @@
+// clang-format off
+// RUN: %clangxx-generic %s %S/Inputs/veccopy-ctor-1.cpp -o %t -Xoffload-linker -lompdevice && %libomptarget-run-generic | %fcheck-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+/*
+ * Test whether OMPT support works for OpenMP directives
+ * in a global constructor without using a shared library.
+ * This currently succeeds.
+ */
+
+#include <stdio.h>
+#include <omp.h>
+
+extern int status;
+
+int main()
+{
+ int N = 10;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i=0; i<N; i++)
+ a[i]=0;
+
+ for (i=0; i<N; i++)
+ b[i]=i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j< N; j++)
+ a[j]=b[j];
+ }
+
+ int rc = 0;
+ for (i=0; i<N; i++)
+ if (a[i] != b[i] ) {
+ rc++;
+ printf ("Wrong varlue: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc && !status)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+/// CHECK: Record Target task
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-default-device/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-default-device/callbacks.h
new file mode 100644
index 0000000000000..5df9985ced0c3
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-default-device/callbacks.h
@@ -0,0 +1,286 @@
+#include <cassert>
+#include <memory>
+#include <unordered_set>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// Enable / disable OMPT's 'External Monitoring Interface'
+#define EMI 0
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+ free(buffer);
+ printf("Deallocated %p\n", buffer);
+}
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+static ompt_get_record_type_t ompt_get_record_type_fn = 0;
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request_default_device(
+ int device_num, ompt_buffer_t **buffer, size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback for default "
+ "device %d\n",
+ *bytes, *buffer, device_num);
+}
+
+// Note: This callback must handle a null begin cursor. Currently,
+// ompt_get_record_ompt, print_record_ompt, and
+// ompt_advance_buffer_cursor handle a null cursor.
+static void on_ompt_callback_buffer_complete_default_device(
+ int device_num, ompt_buffer_t *buffer,
+ size_t bytes, /* bytes returned in this callback */
+ ompt_buffer_cursor_t begin, int buffer_owned) {
+ printf("Executing buffer complete callback for default device: \
+ device_num=%d, buffer=%p, num_bytes=%lu, begin=%p, buffer_owned=%d\n",
+ device_num, buffer, bytes, (void *)begin, buffer_owned);
+
+ int status = 1;
+ ompt_buffer_cursor_t current = begin;
+ while (status) {
+ ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+
+ if (ompt_get_record_type_fn(buffer, current) != ompt_record_ompt) {
+ printf("WARNING: received non-ompt type buffer object\n");
+ }
+ print_record_ompt(rec);
+ status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+ buffer, bytes, current, &current);
+ }
+ if (buffer_owned)
+ delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+ ompt_set_trace_ompt(Device, /*enable=*/1, ompt_callback_target);
+ ompt_set_trace_ompt(Device, /*enable=*/1, ompt_callback_target_data_op);
+ ompt_set_trace_ompt(Device, /*enable=*/1, ompt_callback_target_submit);
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+ if (device_num != omp_get_default_device())
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ return ompt_start_trace(Device,
+ &on_ompt_callback_buffer_request_default_device,
+ &on_ompt_callback_buffer_complete_default_device);
+}
+
+static int flush_trace(ompt_device_t *Device) {
+ if (!ompt_flush_trace)
+ return 0;
+ return ompt_flush_trace(Device);
+}
+
+static int stop_trace(ompt_device_t *Device) {
+ if (!ompt_stop_trace)
+ return 0;
+ return ompt_stop_trace(Device);
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n", device_num,
+ type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ ompt_get_record_type_fn =
+ (ompt_get_record_type_t)lookup("ompt_get_record_type");
+ if (!ompt_get_record_type_fn) {
+ printf("WARNING: No function ompt_get_record_type found in device "
+ "callbacks\n");
+ }
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Load: device_num:%d filename:%s host_adddr:%p device_addr:%p "
+ "bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p "
+ "src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ target_id, host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d "
+ "code=%p\n",
+ target_id, kind, endpoint, device_num, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-default-device/veccopy-ompt-target-default-device.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-default-device/veccopy-ompt-target-default-device.cpp
new file mode 100644
index 0000000000000..ccf11f88b9cb3
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-default-device/veccopy-ompt-target-default-device.cpp
@@ -0,0 +1,86 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <cassert>
+#include <omp.h>
+#include <stdio.h>
+
+// This test starts device tracing on the default device only (see
+// start_trace in callbacks.h). However, if more devices are
+// available, it calls flush and stop on the other devices as
+// well. The intention is to check correct runtime behavior if a tool
+// invokes flush or stop on a device that was not started. The runtime
+// should just return without doing anything.
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+ // Warm up
+#pragma omp target
+ {}
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ for (int dev = 0; dev < omp_get_num_devices(); ++dev) {
+#pragma omp target teams distribute parallel for device(dev)
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+ }
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+// > OMPT device tracing related checks below. <
+
+
+
+// Note: Split checks for record address and content. That way we do not imply
+// any order. Records may / will occur interleaved.
+
+// Note: These addresses will only occur once. They are only captured to
+// indicate their existence.
+
+/// CHECK-DAG: type=8 (Target task)
+/// CHECK-DAG: type=9 (Target data op)
+
+// Note: ADDRX_07 may not trigger a final callback.
+
+// Note: ADDRX_07 may not be deallocated.
+
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-devices/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-devices/callbacks.h
new file mode 100644
index 0000000000000..0625b1a7b1522
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-devices/callbacks.h
@@ -0,0 +1,324 @@
+#include <cassert>
+#include <memory>
+#include <unordered_set>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// Enable / disable OMPT's 'External Monitoring Interface'
+#define EMI 0
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+#define OMPT_BUFFER_REQUEST_SIZE 256
+
+// Map of devices traced
+typedef std::unordered_set<ompt_device_t *> DeviceMap_t;
+typedef std::unique_ptr<DeviceMap_t> DeviceMapPtr_t;
+extern DeviceMapPtr_t DeviceMapPtr;
+
+// Utilities
+static void print_record_ompt(ompt_record_ompt_t *rec) {
+ if (rec == NULL)
+ return;
+
+ switch (rec->type) {
+ case ompt_callback_target:
+ case ompt_callback_target_emi: {
+ ompt_record_target_t target_rec = rec->record.target;
+ printf("rec=%p type=%d (Target task) time=%lu thread_id=%lu "
+ "target_id=0x%lx kind=%d endpoint=%d device=%d task_id=%lu "
+ "codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_rec.kind, target_rec.endpoint, target_rec.device_num,
+ target_rec.task_id, target_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_data_op:
+ case ompt_callback_target_data_op_emi: {
+ ompt_record_target_data_op_t target_data_op_rec =
+ rec->record.target_data_op;
+ printf("rec=%p type=%d (Target data op) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx optype=%d "
+ "src_addr=%p src_device=%d dest_addr=%p dest_device=%d bytes=%lu "
+ "end_time=%lu duration=%lu ns codeptr=%p\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_data_op_rec.host_op_id, target_data_op_rec.optype,
+ target_data_op_rec.src_addr, target_data_op_rec.src_device_num,
+ target_data_op_rec.dest_addr, target_data_op_rec.dest_device_num,
+ target_data_op_rec.bytes, target_data_op_rec.end_time,
+ target_data_op_rec.end_time - rec->time,
+ target_data_op_rec.codeptr_ra);
+ break;
+ }
+ case ompt_callback_target_submit:
+ case ompt_callback_target_submit_emi: {
+ ompt_record_target_kernel_t target_kernel_rec = rec->record.target_kernel;
+ printf("rec=%p type=%d (Target kernel) time=%lu thread_id=%lu "
+ "target_id=0x%lx host_op_id=0x%lx requested_num_teams=%u "
+ "granted_num_teams=%u end_time=%lu duration=%lu ns\n",
+ rec, rec->type, rec->time, rec->thread_id, rec->target_id,
+ target_kernel_rec.host_op_id, target_kernel_rec.requested_num_teams,
+ target_kernel_rec.granted_num_teams, target_kernel_rec.end_time,
+ target_kernel_rec.end_time - rec->time);
+ break;
+ }
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static void delete_buffer_ompt(ompt_buffer_t *buffer) {
+  printf("Deallocated %p\n", buffer); // log first; value is dead after free
+  free(buffer);
+}
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_set_trace_ompt_t ompt_set_trace_ompt = 0;
+static ompt_start_trace_t ompt_start_trace = 0;
+static ompt_flush_trace_t ompt_flush_trace = 0;
+static ompt_stop_trace_t ompt_stop_trace = 0;
+static ompt_get_record_ompt_t ompt_get_record_ompt = 0;
+static ompt_advance_buffer_cursor_t ompt_advance_buffer_cursor = 0;
+static ompt_get_record_type_t ompt_get_record_type_fn = 0;
+// OMPT callbacks
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request_default_device(
+ int device_num, ompt_buffer_t **buffer, size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback for default "
+ "device %d\n",
+ *bytes, *buffer, device_num);
+}
+
+// Trace record callbacks
+static void on_ompt_callback_buffer_request(int device_num,
+ ompt_buffer_t **buffer,
+ size_t *bytes) {
+ *bytes = OMPT_BUFFER_REQUEST_SIZE;
+ *buffer = malloc(*bytes);
+ printf("Allocated %lu bytes at %p in buffer request callback for device %d\n",
+ *bytes, *buffer, device_num);
+}
+
+// Note: This callback must handle a null begin cursor. Currently,
+// ompt_get_record_ompt, print_record_ompt, and
+// ompt_advance_buffer_cursor handle a null cursor.
+static void on_ompt_callback_buffer_complete_default_device(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback for default device: \
+ device_num=%d, buffer=%p, num_bytes=%lu, begin=%p, buffer_owned=%d\n",
+         device_num, buffer, bytes, (void *)begin, buffer_owned);
+  // Print each record from `begin` on until the cursor advance returns 0.
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) {
+    ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+    // Device init only warns when this lookup fails, so it may be null here.
+    if (ompt_get_record_type_fn &&
+        ompt_get_record_type_fn(buffer, current) != ompt_record_ompt)
+      printf("WARNING: received non-ompt type buffer object\n");
+    print_record_ompt(rec);
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned)
+    delete_buffer_ompt(buffer);
+}
+
+// Note: This callback must handle a null begin cursor. Currently,
+// ompt_get_record_ompt, print_record_ompt, and
+// ompt_advance_buffer_cursor handle a null cursor.
+static void on_ompt_callback_buffer_complete(
+    int device_num, ompt_buffer_t *buffer,
+    size_t bytes, /* bytes returned in this callback */
+    ompt_buffer_cursor_t begin, int buffer_owned) {
+  printf("Executing buffer complete callback: \
+ device_num=%d, buffer=%p, num_bytes=%lu, begin=%p, buffer_owned=%d\n",
+         device_num, buffer, bytes, (void *)begin, buffer_owned);
+  // Print each record from `begin` on until the cursor advance returns 0.
+  int status = 1;
+  ompt_buffer_cursor_t current = begin;
+  while (status) {
+    ompt_record_ompt_t *rec = ompt_get_record_ompt(buffer, current);
+    // Device init only warns when this lookup fails, so it may be null here.
+    if (ompt_get_record_type_fn &&
+        ompt_get_record_type_fn(buffer, current) != ompt_record_ompt)
+      printf("WARNING: received non-ompt type buffer object\n");
+    print_record_ompt(rec);
+    status = ompt_advance_buffer_cursor(NULL, /* TODO device */
+                                        buffer, bytes, current, &current);
+  }
+  if (buffer_owned)
+    delete_buffer_ompt(buffer);
+}
+
+static ompt_set_result_t set_trace_ompt(ompt_device_t *Device) {
+ if (!ompt_set_trace_ompt)
+ return ompt_set_error;
+
+ ompt_set_trace_ompt(Device, /*enable=*/1, ompt_callback_target);
+ ompt_set_trace_ompt(Device, /*enable=*/1, ompt_callback_target_data_op);
+ ompt_set_trace_ompt(Device, /*enable=*/1, ompt_callback_target_submit);
+
+ return ompt_set_always;
+}
+
+static int start_trace(int device_num, ompt_device_t *Device) {
+ if (!ompt_start_trace)
+ return 0;
+
+ // This device will be traced.
+ assert(DeviceMapPtr->find(Device) == DeviceMapPtr->end() &&
+ "Device already present in the map");
+ DeviceMapPtr->insert(Device);
+
+ if (device_num == omp_get_default_device())
+ return ompt_start_trace(Device,
+ &on_ompt_callback_buffer_request_default_device,
+ &on_ompt_callback_buffer_complete_default_device);
+ return ompt_start_trace(Device, &on_ompt_callback_buffer_request,
+ &on_ompt_callback_buffer_complete);
+}
+
+// Forwards Device to the ompt_flush_trace entry point; 0 if it was not found.
+static int flush_trace(ompt_device_t *Device) {
+  ompt_flush_trace_t Flush = ompt_flush_trace;
+  return Flush ? Flush(Device) : 0;
+}
+
+// Forwards Device to the ompt_stop_trace entry point; 0 if it was not found.
+static int stop_trace(ompt_device_t *Device) {
+  ompt_stop_trace_t Stop = ompt_stop_trace;
+  return Stop ? Stop(Device) : 0;
+}
+
+// Synchronous callbacks
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+ ompt_device_t *device,
+ ompt_function_lookup_t lookup,
+ const char *documentation) {
+ printf("Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n", device_num,
+ type, device, lookup, documentation);
+ if (!lookup) {
+ printf("Trace collection disabled on device %d\n", device_num);
+ return;
+ }
+
+ ompt_set_trace_ompt = (ompt_set_trace_ompt_t)lookup("ompt_set_trace_ompt");
+ ompt_start_trace = (ompt_start_trace_t)lookup("ompt_start_trace");
+ ompt_flush_trace = (ompt_flush_trace_t)lookup("ompt_flush_trace");
+ ompt_stop_trace = (ompt_stop_trace_t)lookup("ompt_stop_trace");
+ ompt_get_record_ompt = (ompt_get_record_ompt_t)lookup("ompt_get_record_ompt");
+ ompt_advance_buffer_cursor =
+ (ompt_advance_buffer_cursor_t)lookup("ompt_advance_buffer_cursor");
+
+ ompt_get_record_type_fn =
+ (ompt_get_record_type_t)lookup("ompt_get_record_type");
+ if (!ompt_get_record_type_fn) {
+ printf("WARNING: No function ompt_get_record_type found in device "
+ "callbacks\n");
+ }
+
+ // DeviceMap must be initialized only once. Ensure this logic does not
+ // depend on external data structures because this init function may be
+ // called before main.
+ static bool IsDeviceMapInitialized = false;
+ if (!IsDeviceMapInitialized) {
+ DeviceMapPtr = std::make_unique<DeviceMap_t>();
+ IsDeviceMapInitialized = true;
+ }
+
+ set_trace_ompt(device);
+
+ start_trace(device_num, device);
+}
+
+static void on_ompt_callback_device_finalize(int device_num) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load(int device_num, const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file, size_t bytes,
+ void *host_addr, void *device_addr,
+ uint64_t module_id) {
+ printf("Load: device_num:%d filename:%s host_adddr:%p device_addr:%p "
+ "bytes:%lu\n",
+ device_num, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op(
+ ompt_id_t target_id, ompt_id_t host_op_id, ompt_target_data_op_t optype,
+ void *src_addr, int src_device_num, void *dest_addr, int dest_device_num,
+ size_t bytes, const void *codeptr_ra) {
+ printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p "
+ "src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ target_id, host_op_id, optype, src_addr, src_device_num, dest_addr,
+ dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target(ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num, ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra) {
+ printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d "
+ "code=%p\n",
+ target_id, kind, endpoint, device_num, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit(ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+// Init functions
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+ ompt_data_t *tool_data) {
+ ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+
+ if (!ompt_set_callback)
+ return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+
+ return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version) {
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+ &ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-devices/veccopy-ompt-target-devices.cpp b/offload/test/ompt/tracing/veccopy-ompt-target-devices/veccopy-ompt-target-devices.cpp
new file mode 100644
index 0000000000000..7805e8c89de40
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-devices/veccopy-ompt-target-devices.cpp
@@ -0,0 +1,86 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <cassert>
+#include <omp.h>
+#include <stdio.h>
+
+// This test starts device tracing on all available devices (see
+// start_trace in callbacks.h). It subsequently flushes the trace of
+// every traced device; stop_trace is defined but not called here.
+
+#include "callbacks.h"
+
+// Map of devices traced
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+ // Warm up
+#pragma omp target
+ {}
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ for (int dev = 0; dev < omp_get_num_devices(); ++dev) {
+#pragma omp target teams distribute parallel for device(dev)
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j];
+ }
+ }
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+// clang-format off
+
+// > OMPT device tracing related checks below. <
+
+// Note: Since this test may (theoretically) run on an arbitrary number of
+// devices, we may only check for the minimal set of records.
+
+
+
+// Note: Split checks for record address and content. That way we do not imply
+// any order. Records may / will occur interleaved.
+
+// Note: These addresses will only occur once. They are only captured to
+// indicate their existence.
+
+/// CHECK-DAG: type=8 (Target task)
+/// CHECK-DAG: type=9 (Target data op)
+
+// Note: ADDRX_07 may not trigger a final callback.
+
+// Note: ADDRX_07 may not be deallocated.
+
+/// CHECK-NOT: host_op_id=0x0
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-translate-time/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-translate-time/callbacks.h
new file mode 100644
index 0000000000000..5ce283c1a16b3
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-translate-time/callbacks.h
@@ -0,0 +1,166 @@
+#include <assert.h>
+#include <math.h>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_translate_time_t ompt_translate_time_fn = 0;
+static ompt_get_device_time_t ompt_get_device_time_fn = 0;
+
+// OMPT callbacks
+
+// Synchronous callbacks
+// Device-initialize callback. Resolves the time entry points through `lookup`,
+// prints the value returned by ompt_get_device_time and, when translation is
+// possible, the host wall-clock time next to the translated device time.
+// Both entry points are called with a NULL device handle.
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+                                               ompt_device_t *device,
+                                               ompt_function_lookup_t lookup,
+                                               const char *documentation) {
+  printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+         device_num, type, device, lookup, documentation);
+  if (!lookup) return; // no lookup function, hence no entry points to resolve
+  ompt_device_time_t device_time = 0;
+
+  ompt_translate_time_fn = (ompt_translate_time_t) lookup("ompt_translate_time");
+  ompt_get_device_time_fn = (ompt_get_device_time_t) lookup("ompt_get_device_time");
+
+  if (ompt_get_device_time_fn) {
+    uint64_t time = ompt_get_device_time_fn(NULL);
+    printf("The device time can be queried: %lu\n", time);
+    device_time = time;
+  } else {
+    printf("Could not determine the device time\n");
+  }
+  // Require both entry points: the assert below disappears under NDEBUG.
+  if (ompt_translate_time_fn && ompt_get_device_time_fn) {
+    assert(device_time > 0 && "Obtaining the device time was successful");
+    printf("Translate time available\n");
+    double omp_wtime = omp_get_wtime();
+    device_time = ompt_get_device_time_fn(NULL);
+    double omp_device_wtime = ompt_translate_time_fn(NULL, device_time);
+    printf("OMPT: Host time: %f, Device time: %f\n", omp_wtime, omp_device_wtime);
+  } else {
+    printf("Could not translate time.\n");
+  }
+}
+
+static void on_ompt_callback_device_finalize
+(
+ int device_num
+ ) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load
+ (
+ int device_num,
+ const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file,
+ size_t bytes,
+ void *host_addr,
+ void *device_addr,
+ uint64_t module_id
+ ) {
+ printf("Callback Load: device_num:%d module_id:%lu filename:%s host_adddr:%p device_addr:%p bytes:%lu\n",
+ device_num, module_id, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op
+ (
+ ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ ompt_target_data_op_t optype,
+ void *src_addr,
+ int src_device_num,
+ void *dest_addr,
+ int dest_device_num,
+ size_t bytes,
+ const void *codeptr_ra
+ ) {
+ assert(codeptr_ra != 0);
+ // At least one of src and dest must be non-null
+ assert(src_addr != 0 || dest_addr != 0);
+ printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ target_id, host_op_id, optype, src_addr, src_device_num,
+ dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target
+ (
+ ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num,
+ ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra
+ ) {
+ assert(codeptr_ra != 0);
+ printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d code=%p\n",
+ target_id, kind, endpoint, device_num, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit
+ (
+ ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams
+ ) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+// Init functions
+int ompt_initialize(
+ ompt_function_lookup_t lookup,
+ int initial_device_num,
+ ompt_data_t *tool_data)
+{
+ ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+
+ if (!ompt_set_callback) return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+
+
+
+ return 1; //success
+}
+
+void ompt_finalize(ompt_data_t *tool_data)
+{
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(
+ unsigned int omp_version,
+ const char *runtime_version)
+{
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-translate-time/veccopy-ompt-target.c b/offload/test/ompt/tracing/veccopy-ompt-target-translate-time/veccopy-ompt-target.c
new file mode 100644
index 0000000000000..211da6aaacd33
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-translate-time/veccopy-ompt-target.c
@@ -0,0 +1,78 @@
+// clang-format off
+// RUN: %libomptarget-compile-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <stdio.h>
+#include <omp.h>
+
+#include "callbacks.h"
+
+int main()
+{
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i=0; i<N; i++)
+ a[i]=0;
+
+ for (i=0; i<N; i++)
+ b[i]=i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j< N; j++)
+ a[j]=b[j];
+ }
+
+#pragma omp target teams distribute parallel for
+ {
+ for (int j = 0; j< N; j++)
+ a[j]=b[j];
+ }
+
+ int rc = 0;
+ for (i=0; i<N; i++)
+ if (a[i] != b[i] ) {
+ rc++;
+ printf ("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+ /// CHECK: Callback Init:
+ /// CHECK: The device time can be queried: [[X:[0-9]+]]
+ /// CHECK: Translate time available
+ /// CHECK: Callback Load:
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+ /// CHECK: Callback Fini:
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/callbacks.h b/offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/callbacks.h
new file mode 100644
index 0000000000000..655e230275314
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/callbacks.h
@@ -0,0 +1,160 @@
+#include <assert.h>
+
+// Tool related code below
+#include <omp-tools.h>
+
+// From openmp/runtime/test/ompt/callback.h
+#define register_ompt_callback_t(name, type) \
+ do { \
+ type f_##name = &on_##name; \
+ if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never) \
+ printf("0: Could not register callback '" #name "'\n"); \
+ } while (0)
+
+#define register_ompt_callback(name) register_ompt_callback_t(name, name##_t)
+
+// OMPT entry point handles
+static ompt_set_callback_t ompt_set_callback = 0;
+static ompt_get_record_type_t ompt_get_record_type_fn = 0;
+static ompt_get_device_time_t ompt_get_device_time_fn = 0;
+
+// OMPT callbacks
+
+// Synchronous callbacks
+// Device-initialize callback. Resolves ompt_get_record_type and
+// ompt_get_device_time through `lookup`, then prints the record type reported
+// for a NULL buffer and the value returned by ompt_get_device_time (called
+// with a NULL device handle).
+static void on_ompt_callback_device_initialize(int device_num, const char *type,
+                                               ompt_device_t *device,
+                                               ompt_function_lookup_t lookup,
+                                               const char *documentation) {
+  printf("Callback Init: device_num=%d type=%s device=%p lookup=%p doc=%p\n",
+         device_num, type, device, lookup, documentation);
+  if (!lookup) return; // no lookup function, hence no entry points to resolve
+  ompt_get_record_type_fn = (ompt_get_record_type_t) lookup("ompt_get_record_type");
+  ompt_get_device_time_fn = (ompt_get_device_time_t) lookup("ompt_get_device_time");
+
+  if (ompt_get_record_type_fn) {
+    ompt_buffer_cursor_t buf = 0;
+    // FIXME: For now, we pass a NULL buffer and a zero-initialized cursor
+    ompt_record_t rec_type = ompt_get_record_type_fn(NULL, buf);
+    printf("Record Type: %s\n", ((rec_type == ompt_record_ompt) ? "OMPT" : "Unknown"));
+  } else {
+    printf("Could not determine Record Type\n");
+  }
+
+  if (ompt_get_device_time_fn) {
+    uint64_t time = ompt_get_device_time_fn(NULL);
+    printf("The device time can be queried: %lu\n", time);
+  } else {
+    printf("Could not determine the device time \n");
+  }
+}
+
+static void on_ompt_callback_device_finalize
+(
+ int device_num
+ ) {
+ printf("Callback Fini: device_num=%d\n", device_num);
+}
+
+static void on_ompt_callback_device_load
+ (
+ int device_num,
+ const char *filename,
+ int64_t offset_in_file,
+ void *vma_in_file,
+ size_t bytes,
+ void *host_addr,
+ void *device_addr,
+ uint64_t module_id
+ ) {
+ printf("Callback Load: device_num:%d module_id:%lu filename:%s host_adddr:%p device_addr:%p bytes:%lu\n",
+ device_num, module_id, filename, host_addr, device_addr, bytes);
+}
+
+static void on_ompt_callback_target_data_op
+ (
+ ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ ompt_target_data_op_t optype,
+ void *src_addr,
+ int src_device_num,
+ void *dest_addr,
+ int dest_device_num,
+ size_t bytes,
+ const void *codeptr_ra
+ ) {
+ assert(codeptr_ra != 0);
+ // At least one of src and dest must be non-null
+ assert(src_addr != 0 || dest_addr != 0);
+ printf(" Callback DataOp: target_id=%lu host_op_id=%lu optype=%d src=%p src_device_num=%d "
+ "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
+ target_id, host_op_id, optype, src_addr, src_device_num,
+ dest_addr, dest_device_num, bytes, codeptr_ra);
+}
+
+static void on_ompt_callback_target
+ (
+ ompt_target_t kind,
+ ompt_scope_endpoint_t endpoint,
+ int device_num,
+ ompt_data_t *task_data,
+ ompt_id_t target_id,
+ const void *codeptr_ra
+ ) {
+ assert(codeptr_ra != 0);
+ printf("Callback Target: target_id=%lu kind=%d endpoint=%d device_num=%d code=%p\n",
+ target_id, kind, endpoint, device_num, codeptr_ra);
+}
+
+static void on_ompt_callback_target_submit
+ (
+ ompt_id_t target_id,
+ ompt_id_t host_op_id,
+ unsigned int requested_num_teams
+ ) {
+ printf(" Callback Submit: target_id=%lu host_op_id=%lu req_num_teams=%d\n",
+ target_id, host_op_id, requested_num_teams);
+}
+
+// Init functions
+int ompt_initialize(
+ ompt_function_lookup_t lookup,
+ int initial_device_num,
+ ompt_data_t *tool_data)
+{
+ ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+
+ if (!ompt_set_callback) return 0; // failed
+
+ register_ompt_callback(ompt_callback_device_initialize);
+ register_ompt_callback(ompt_callback_device_finalize);
+ register_ompt_callback(ompt_callback_device_load);
+ register_ompt_callback(ompt_callback_target_data_op);
+ register_ompt_callback(ompt_callback_target);
+ register_ompt_callback(ompt_callback_target_submit);
+
+
+
+ return 1; //success
+}
+
+void ompt_finalize(ompt_data_t *tool_data)
+{
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ompt_start_tool_result_t *ompt_start_tool(
+ unsigned int omp_version,
+ const char *runtime_version)
+{
+ static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize, 0};
+ return &ompt_start_tool_result;
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/veccopy-ompt-target.c b/offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/veccopy-ompt-target.c
new file mode 100644
index 0000000000000..22374a691b8a3
--- /dev/null
+++ b/offload/test/ompt/tracing/veccopy-ompt-target-type-device-time/veccopy-ompt-target.c
@@ -0,0 +1,78 @@
+// clang-format off
+// RUN: %libomptarget-compile-run-and-check-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+#include <stdio.h>
+#include <omp.h>
+
+#include "callbacks.h"
+
+int main()
+{
+ int N = 100000;
+
+ int a[N];
+ int b[N];
+
+ int i;
+
+ for (i=0; i<N; i++)
+ a[i]=0;
+
+ for (i=0; i<N; i++)
+ b[i]=i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j< N; j++)
+ a[j]=b[j];
+ }
+
+#pragma omp target teams distribute parallel for
+ {
+ for (int j = 0; j< N; j++)
+ a[j]=b[j];
+ }
+
+ int rc = 0;
+ for (i=0; i<N; i++)
+ if (a[i] != b[i] ) {
+ rc++;
+ printf ("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+ /// CHECK: Callback Init:
+ /// CHECK: Record Type: OMPT
+ /// CHECK: The device time can be queried: [[X:[0-9]+]]
+ /// CHECK: Callback Load:
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+ /// CHECK: Callback Submit: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] req_num_teams=0
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+ /// CHECK: Callback Target: target_id=[[TARGET_ID:[0-9]+]] kind=1 endpoint=2
+ /// CHECK: Callback Fini:
>From 047db8e664592158a809d3a2b21298079552d83c Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 21 May 2026 09:11:16 -0500
Subject: [PATCH 12/15] [OMPT] Add test for synchronous path w/ OMPT
---
.../large_mapping_sync_tracing.cpp | 81 +++++++++++++++++++
1 file changed, 81 insertions(+)
create mode 100644 offload/test/ompt/tracing/large-mapping-sync-tracing/large_mapping_sync_tracing.cpp
diff --git a/offload/test/ompt/tracing/large-mapping-sync-tracing/large_mapping_sync_tracing.cpp b/offload/test/ompt/tracing/large-mapping-sync-tracing/large_mapping_sync_tracing.cpp
new file mode 100644
index 0000000000000..44939536d30f3
--- /dev/null
+++ b/offload/test/ompt/tracing/large-mapping-sync-tracing/large_mapping_sync_tracing.cpp
@@ -0,0 +1,81 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic && env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=1 LIBOMPTARGET_OMPT_FLUSH_ON_SHUTDOWN=false LIBOMPTARGET_OMPT_FLUSH_ON_BUFFER_FULL=false %libomptarget-run-generic | %fcheck-generic
+// REQUIRES: ompt
+// REQUIRES: amdgpu
+// clang-format on
+
+// Exercises OMPT device tracing for AMDGPU's synchronous large-copy path.
+// The transfer size is intentionally above the plugin's default async-copy
+// threshold, while the RUN line also lowers the threshold to keep the test
+// independent of future default changes.
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../veccopy-ompt-target-data-tracing-emi/callbacks.h"
+
+#define COPY_BYTES (50 * 1024 * 1024)
+
+DeviceMapPtr_t DeviceMapPtr;
+
+int main() {
+ if (omp_get_num_devices() == 0) {
+ printf("Success\n");
+ return 0;
+ }
+
+ const size_t N = COPY_BYTES / sizeof(int);
+ int *Input = (int *)malloc(COPY_BYTES);
+ int *Output = (int *)malloc(COPY_BYTES);
+
+ if (!Input || !Output) {
+ free(Input);
+ free(Output);
+ printf("Failure\n");
+ return 1;
+ }
+
+ for (size_t I = 0; I < N; ++I) {
+ Input[I] = (int)(I & 0x7fff);
+ Output[I] = 0;
+ }
+
+#pragma omp target teams distribute parallel for map(to : Input[0 : N]) \
+ map(from : Output[0 : N])
+ for (size_t I = 0; I < N; ++I)
+ Output[I] = Input[I] + 1;
+
+ for (auto Dev : *DeviceMapPtr)
+ flush_trace(Dev);
+
+ int Failed = 0;
+ for (size_t I = 0; I < N; ++I) {
+ if (Output[I] != Input[I] + 1) {
+ Failed = 1;
+ break;
+ }
+ }
+
+ free(Input);
+ free(Output);
+
+ printf("%s\n", Failed ? "Failure" : "Success");
+ return Failed;
+}
+
+// clang-format off
+
+/// CHECK: Callback Init:
+/// CHECK: Callback Load:
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=1
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=2 {{.*}} bytes=52428800
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=2 {{.*}} bytes=52428800
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=3 {{.*}} bytes=52428800
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=3 {{.*}} bytes=52428800
+/// CHECK-DAG: Callback DataOp EMI: endpoint=1 optype=4
+/// CHECK-DAG: Callback DataOp EMI: endpoint=2 optype=4
+
+/// CHECK-DAG: type=9 (Target data op) {{.*}} optype=1 {{.*}} bytes=52428800
+/// CHECK: Success
>From ea38101f1d05c23d128e585affcde9fc69a42871 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 21 May 2026 09:23:33 -0500
Subject: [PATCH 13/15] [OMPT] Remove ExecAsync flag
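The ExecAsync flag, DeviceTy::ForceSynchronousTargetRegions, and the
setAsyncInfoSynchronous() helper that connected them were leftovers from
porting the downstream implementation upstream; remove them.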
---
offload/include/Shared/APITypes.h | 6 ------
offload/include/device.h | 3 ---
offload/libomptarget/device.cpp | 11 +----------
.../plugins-nextgen/common/src/PluginInterface.cpp | 6 ------
4 files changed, 1 insertion(+), 25 deletions(-)
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index a06d5bd490943..592f6e2be1096 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -86,12 +86,6 @@ struct __tgt_async_info {
/// happening.
KernelLaunchEnvironmentTy KernelLaunchEnvironment;
- /// Whether this operation should execute asynchronously. When false, the
- /// runtime will synchronize the queue after the operation even if a queue
- /// is present. This allows profiling/tracing to use queue machinery while
- /// still enforcing synchronous completion boundaries.
- bool ExecAsync = true;
-
/// Opaque handle for profiler-specific data (e.g., OMPT trace record info).
/// Owned by the profiler; the runtime threads this pointer through the plugin
/// layer to associate async operations with trace records.
diff --git a/offload/include/device.h b/offload/include/device.h
index 5a4aa552a790e..af103c316c3cf 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -49,9 +49,6 @@ struct DeviceTy {
GenericPluginTy *RTL;
int32_t RTLDeviceID;
- /// Flag to force synchronous execution (used by OMPT device tracing)
- bool ForceSynchronousTargetRegions = false;
-
DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID);
// DeviceTy is not copyable
DeviceTy(const DeviceTy &D) = delete;
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f13354a27c624..db5968b90ee78 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -72,7 +72,7 @@ int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
DeviceTy::DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
: DeviceID(DeviceID), RTL(RTL), RTLDeviceID(RTLDeviceID),
- ForceSynchronousTargetRegions(false), MappingInfo(*this) {}
+ MappingInfo(*this) {}
DeviceTy::~DeviceTy() {
if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
@@ -82,11 +82,6 @@ DeviceTy::~DeviceTy() {
dumpTargetPointerMappings(&Loc, *this);
}
-inline void setAsyncInfoSynchronous(__tgt_async_info *AI, bool SetSynchronous) {
- if (SetSynchronous)
- AI->ExecAsync = false;
-}
-
llvm::Error DeviceTy::init() {
int32_t Ret = RTL->init_device(RTLDeviceID);
if (Ret != OFFLOAD_SUCCESS)
@@ -300,7 +295,6 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
HstPtrBegin, DeviceID, TgtPtrBegin, Size,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
- setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
AsyncInfo);
}
@@ -328,7 +322,6 @@ int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
omp_initial_device, HstPtrBegin, Size,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
- setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
AsyncInfo);
}
@@ -355,7 +348,6 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
DstDev.RTLDeviceID, DstPtr, Size,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
- setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
DstPtr, Size, AsyncInfo);
}
@@ -390,7 +382,6 @@ int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
KernelExtraArgsTy *KernelExtraArgs,
AsyncInfoTy &AsyncInfo) {
- setAsyncInfoSynchronous(AsyncInfo, ForceSynchronousTargetRegions);
return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
&KernelArgs, KernelExtraArgs, AsyncInfo);
}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 1186fd3562be0..db0c9819e69bb 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -65,12 +65,6 @@ void AsyncInfoWrapperTy::finalize(Error &Err) {
if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
Err = Device.synchronize(&LocalAsyncInfo);
- // When ExecAsync is false (e.g., for profiling/tracing), synchronize the
- // queue even if it's not the local async info, so completion callbacks run
- // before we return to the caller.
- if (AsyncInfoPtr && !AsyncInfoPtr->ExecAsync && AsyncInfoPtr->Queue && !Err)
- Err = Device.synchronize(AsyncInfoPtr);
-
// Invalidate the wrapper object.
AsyncInfoPtr = nullptr;
}
>From fbd7aaabc27c30ee75cc61fccc96e397424a2f37 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Tue, 2 Jun 2026 08:57:27 -0500
Subject: [PATCH 14/15] [OMPT] Fix use of debug namespace in OMPT code
---
.../libomptarget/OpenMP/OMPT/OmptTracing.cpp | 36 +++++++++----------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp b/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
index 2e7b431b71389..1045801842e90 100644
--- a/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
+++ b/offload/libomptarget/OpenMP/OMPT/OmptTracing.cpp
@@ -256,7 +256,7 @@ llvm::omp::target::ompt::setTraceEventTy(int DeviceId, unsigned int Enable,
return ompt_set_never;
}
- ODBG(ODT_Tool) << "Executing setTraceEventTy: DeviceId=" << DeviceId
+ ODBG(debug::ODT_Tool) << "Executing setTraceEventTy: DeviceId=" << DeviceId
<< " Enable=" << Enable << " EventTy=" << EventTy;
std::unique_lock<std::mutex> Lock(DeviceAccessMutex);
@@ -407,7 +407,7 @@ ompt_record_ompt_t *Interface::stopTargetDataAllocTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Generated trace record: " << DataPtr
+ ODBG(debug::ODT_Tool) << "Generated trace record: " << DataPtr
<< " (ompt_target_data_alloc)";
return DataPtr;
}
@@ -438,7 +438,7 @@ ompt_record_ompt_t *Interface::stopTargetDataDeleteTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Generated trace record: " << DataPtr
+ ODBG(debug::ODT_Tool) << "Generated trace record: " << DataPtr
<< " (ompt_target_data_delete)";
return DataPtr;
}
@@ -467,7 +467,7 @@ Interface::startTargetDataSubmitTrace(int64_t SrcDeviceId, void *SrcPtrBegin,
ompt_target_data_transfer_to_device, SrcPtrBegin,
SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
- ODBG(ODT_Tool) << "OMPT-Async: Returning data trace record buf ptr " << DataPtr;
+ ODBG(debug::ODT_Tool) << "OMPT-Async: Returning data trace record buf ptr " << DataPtr;
return DataPtr;
}
@@ -494,7 +494,7 @@ Interface::startTargetDataRetrieveTrace(int64_t SrcDeviceId, void *SrcPtrBegin,
ompt_target_data_transfer_from_device, SrcPtrBegin,
SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
- ODBG(ODT_Tool) << "OMPT-Async: Returning data trace record buf ptr " << DataPtr;
+ ODBG(debug::ODT_Tool) << "OMPT-Async: Returning data trace record buf ptr " << DataPtr;
return DataPtr;
}
@@ -510,7 +510,7 @@ ompt_record_ompt_t *Interface::stopTargetDataMovementTraceAsync(
OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "OMPT-Async: Completed target_data trace record " << DataPtr;
+ ODBG(debug::ODT_Tool) << "OMPT-Async: Completed target_data trace record " << DataPtr;
return DataPtr;
}
@@ -532,7 +532,7 @@ ompt_record_ompt_t *Interface::startTargetSubmitTrace(int64_t DeviceId,
DataPtr->record.target_kernel.host_op_id = getHostOpId();
// May be null if event is not traced
- ODBG(ODT_Tool) << "OMPT-Async: Returning kernel trace record buf ptr " << DataPtr;
+ ODBG(debug::ODT_Tool) << "OMPT-Async: Returning kernel trace record buf ptr " << DataPtr;
return DataPtr;
}
@@ -550,7 +550,7 @@ Interface::stopTargetSubmitTraceAsync(ompt_record_ompt_t *DataPtr,
OmptTracingBufferMgr *TRM = PM->getTraceRecordManager();
// Ready Record
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "OMPT-Async: Completed trace record buf ptr " << DataPtr;
+ ODBG(debug::ODT_Tool) << "OMPT-Async: Completed trace record buf ptr " << DataPtr;
return DataPtr;
}
@@ -574,7 +574,7 @@ ompt_record_ompt_t *Interface::startTargetDataEnterTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ ODBG(debug::ODT_Tool) << "Returning trace record buf ptr " << DataPtr
<< " (ompt_target_enter_data)";
return DataPtr;
}
@@ -599,7 +599,7 @@ ompt_record_ompt_t *Interface::stopTargetDataEnterTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ ODBG(debug::ODT_Tool) << "Generated trace record " << DataPtr
<< " (ompt_target_enter_data)";
return DataPtr;
}
@@ -624,7 +624,7 @@ ompt_record_ompt_t *Interface::startTargetDataExitTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ ODBG(debug::ODT_Tool) << "Returning trace record buf ptr " << DataPtr
<< " (ompt_target_exit_data)";
return DataPtr;
}
@@ -649,7 +649,7 @@ ompt_record_ompt_t *Interface::stopTargetDataExitTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ ODBG(debug::ODT_Tool) << "Generated trace record " << DataPtr
<< " (ompt_target_exit_data)";
return DataPtr;
}
@@ -674,7 +674,7 @@ ompt_record_ompt_t *Interface::startTargetUpdateTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ ODBG(debug::ODT_Tool) << "Returning trace record buf ptr " << DataPtr
<< " (ompt_target_update)";
return DataPtr;
}
@@ -699,7 +699,7 @@ ompt_record_ompt_t *Interface::stopTargetUpdateTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ ODBG(debug::ODT_Tool) << "Generated trace record " << DataPtr
<< " (ompt_target_update)";
return DataPtr;
}
@@ -724,7 +724,7 @@ ompt_record_ompt_t *Interface::startTargetTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Returning trace record buf ptr " << DataPtr
+ ODBG(debug::ODT_Tool) << "Returning trace record buf ptr " << DataPtr
<< " (ompt_target)";
return DataPtr;
}
@@ -750,7 +750,7 @@ ompt_record_ompt_t *Interface::stopTargetTrace(int64_t DeviceId,
// The trace record has been created, mark it ready for delivery to the tool
TRM->setTRStatus(DataPtr, OmptTracingBufferMgr::TR_ready);
- ODBG(ODT_Tool) << "Generated trace record " << DataPtr
+ ODBG(debug::ODT_Tool) << "Generated trace record " << DataPtr
<< " (ompt_target)";
return DataPtr;
}
@@ -839,7 +839,7 @@ int libomptarget_ompt_advance_buffer_cursor(ompt_device_t *Device,
ompt_buffer_cursor_t CurrentPos,
ompt_buffer_cursor_t *NextPos) {
if (!PM) {
- REPORT() << "Failed to advance buffer cursor for Device="
+ REPORT() << "Failed to advance buffer cursor for Device="
<< Device << " (invalid plugin manager)";
// Indicate failure
return false;
@@ -856,7 +856,7 @@ int libomptarget_ompt_advance_buffer_cursor(ompt_device_t *Device,
size_t TRSize = TRM->getTRSize();
*NextPos = (ompt_buffer_cursor_t)(TraceRecord + TRSize);
- ODBG(ODT_Tool) << "Advanced buffer pointer by "
+ ODBG(llvm::omp::target::debug::ODT_Tool) << "Advanced buffer pointer by "
<< TRSize << " bytes to "
<< TraceRecord + TRSize;
return true;
>From c014b66e0b923e0175b2abcaa27de48b5fea4eaa Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Tue, 2 Jun 2026 08:58:22 -0500
Subject: [PATCH 15/15] [OMPT] Skip TRM delete when offload is disabled
The trace record manager (TRM) is not initialized when offloading is
disabled, so PluginManager::deinit() must not assert on or delete it in
that case.
---
offload/libomptarget/PluginManager.cpp | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp
index a91e3ca954d5c..33c35b4f8db17 100644
--- a/offload/libomptarget/PluginManager.cpp
+++ b/offload/libomptarget/PluginManager.cpp
@@ -64,10 +64,13 @@ void PluginManager::deinit() {
ODBG(ODT_Deinit) << "Unloading RTLs...";
#ifdef OMPT_SUPPORT
- assert(TraceRecordManager != nullptr &&
- "Trace record manager should have been non-null");
- delete TraceRecordManager;
- TraceRecordManager = nullptr;
+ // When offloading is disabled, the TRM is not initialized.
+ if (!OffloadPolicy::isOffloadDisabled()) {
+ assert(TraceRecordManager != nullptr &&
+ "Trace record manager should have been non-null");
+ delete TraceRecordManager;
+ TraceRecordManager = nullptr;
+ }
#endif
for (auto &Plugin : Plugins) {
More information about the Openmp-commits
mailing list