[Openmp-commits] [lld] [llvm] [openmp] [OpenMP] Improve omp offload profiler (PR #68016)
Felipe Cabarcas via Openmp-commits
openmp-commits at lists.llvm.org
Fri Dec 22 11:10:48 PST 2023
https://github.com/fel-cab updated https://github.com/llvm/llvm-project/pull/68016
>From 16e38453b6b9066824020829b1e22ab44dd5706b Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Mon, 18 Sep 2023 12:07:12 +0000
Subject: [PATCH 01/14] testing Profiler features
---
openmp/libomptarget/src/interface.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d9e87640161f26..8b9f146695bb63 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -241,7 +241,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_IDENT(Loc);
+ //TIMESCOPE_WITH_IDENT(Loc);
+ TIMESCOPE();
+ //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
+ //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
>From 0f850824541175eae650f0eac87a6425d5275b88 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 12:02:53 +0000
Subject: [PATCH 02/14] Improve Profiler 1
---
llvm/lib/Support/TimeProfiler.cpp | 2 +-
openmp/libomptarget/src/interface.cpp | 14 ++++++--------
openmp/libomptarget/src/omptarget.cpp | 10 +++++-----
3 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4d625b3eb5b170..e1458116f64ab4 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -227,7 +227,7 @@ struct llvm::TimeTraceProfiler {
J.attribute("ph", "X");
J.attribute("ts", 0);
J.attribute("dur", DurUs);
- J.attribute("name", "Total " + Total.first);
+ J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
J.attribute("avg ms", int64_t(DurUs / Count / 1000));
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 8b9f146695bb63..b98d33cce22ecb 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
- TIMESCOPE();
PM->addRequirements(Flags);
}
////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
if (PM->delayRegisterLib(Desc))
return;
@@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
PM->unregisterLib(Desc);
}
@@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+ //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+ TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -242,9 +240,9 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
//TIMESCOPE_WITH_IDENT(Loc);
- TIMESCOPE();
+ //TIMESCOPE();
//TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
- //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
+ //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
@@ -405,7 +403,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
- TIMESCOPE();
+ //TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -417,7 +415,7 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e724b2f6db8b5f..90c30d391bd8cb 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -392,7 +392,7 @@ static int32_t getParentIndex(int64_t Type) {
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -419,7 +419,7 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
@@ -444,7 +444,7 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -471,7 +471,7 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,7 +531,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
- TIMESCOPE_WITH_IDENT(Loc);
+ //TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
>From 498cb39e8054245db88f9943722386d87ac6e17c Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 21:33:24 +0000
Subject: [PATCH 03/14] Changed profiling to work in nanoseconds. Made
Profiling calls for runtime calls and different ones for kernel launches and
memory transfers.
---
llvm/lib/Support/TimeProfiler.cpp | 28 +++++++++++++--------------
openmp/libomptarget/src/interface.cpp | 7 ++-----
openmp/libomptarget/src/omptarget.cpp | 10 ++++++----
3 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index e1458116f64ab4..64b3ef35be27c4 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
using std::chrono::duration;
using std::chrono::duration_cast;
-using std::chrono::microseconds;
+using std::chrono::nanoseconds;
using std::chrono::steady_clock;
using std::chrono::system_clock;
using std::chrono::time_point;
@@ -80,14 +80,14 @@ struct TimeTraceProfilerEntry {
// rather than casting duration. This avoids truncation issues causing inner
// scopes overruning outer scopes.
ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
- return (time_point_cast<microseconds>(Start) -
- time_point_cast<microseconds>(StartTime))
+ return (time_point_cast<nanoseconds>(Start) -
+ time_point_cast<nanoseconds>(StartTime))
.count();
}
ClockType::rep getFlameGraphDurUs() const {
- return (time_point_cast<microseconds>(End) -
- time_point_cast<microseconds>(Start))
+ return (time_point_cast<nanoseconds>(End) -
+ time_point_cast<nanoseconds>(Start))
.count();
}
};
@@ -123,7 +123,7 @@ struct llvm::TimeTraceProfiler {
DurationType Duration = E.End - E.Start;
// Only include sections longer or equal to TimeTraceGranularity msec.
- if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
+ if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
Entries.emplace_back(E);
// Track total time taken by each "name", but only the topmost levels of
@@ -169,8 +169,8 @@ struct llvm::TimeTraceProfiler {
J.attribute("pid", Pid);
J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
- J.attribute("ts", StartUs);
- J.attribute("dur", DurUs);
+ J.attribute("ts", StartUs / 1000);
+ J.attribute("dur", DurUs / 1000);
J.attribute("name", E.Name);
if (!E.Detail.empty()) {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
// Report totals on separate threads of tracing file.
uint64_t TotalTid = MaxTid + 1;
for (const NameAndCountAndDurationType &Total : SortedTotals) {
- auto DurUs = duration_cast<microseconds>(Total.second.second).count();
+ auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
auto Count = AllCountAndTotalPerName[Total.first].first;
J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurUs);
+ J.attribute("dur", DurUs / 1000);
J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
- J.attribute("avg ms", int64_t(DurUs / Count / 1000));
+ J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
});
});
@@ -262,9 +262,9 @@ struct llvm::TimeTraceProfiler {
// This can be used to combine the profiling data from
// multiple processes and preserve actual time intervals.
J.attribute("beginningOfTime",
- time_point_cast<microseconds>(BeginningOfTime)
+ time_point_cast<nanoseconds>(BeginningOfTime)
.time_since_epoch()
- .count());
+ .count()/1000);
J.objectEnd();
}
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
SmallString<0> ThreadName;
const uint64_t Tid;
- // Minimum time granularity (in microseconds)
+ // Minimum time granularity (in nanoseconds)
const unsigned TimeTraceGranularity;
};
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index b98d33cce22ecb..d6c6a25ac74234 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -66,7 +66,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
//TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
- TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
+ TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -239,10 +239,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
- //TIMESCOPE_WITH_IDENT(Loc);
- //TIMESCOPE();
- //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
- //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
+ TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 90c30d391bd8cb..e1f90093341c88 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -821,6 +821,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+ //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
int Ret = OFFLOAD_SUCCESS;
auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
// process each input.
@@ -913,7 +914,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-
+ TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1403,7 +1404,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
@@ -1456,6 +1456,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
DPxPTR(HstPtrVal));
continue;
}
+ TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret =
@@ -1537,7 +1538,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
+
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
@@ -1564,6 +1565,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
return Ret;
});
+
return OFFLOAD_SUCCESS;
}
} // namespace
@@ -1639,7 +1641,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
+ TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
>From 5c40b78648128b50d464bbb360aaf4b1c72ff669 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Thu, 21 Sep 2023 14:22:28 +0000
Subject: [PATCH 04/14] test with DevToHost
---
openmp/libomptarget/src/omptarget.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e1f90093341c88..90b4dc92c299f8 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -914,7 +914,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
+ std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
+ TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
>From c12fd356e5ea3cb585065e433f413f6603db401c Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Fri, 22 Sep 2023 21:48:57 +0000
Subject: [PATCH 05/14] Fixing nanoseconds in totals, adding synchronize
timings, and adding extra info in kernels and device
---
llvm/lib/Support/TimeProfiler.cpp | 24 ++++++++++++------------
openmp/libomptarget/src/interface.cpp | 15 ++++++++-------
openmp/libomptarget/src/omptarget.cpp | 17 +++++++++--------
3 files changed, 29 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 64b3ef35be27c4..4446583102a813 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -79,13 +79,13 @@ struct TimeTraceProfilerEntry {
// Calculate timings for FlameGraph. Cast time points to microsecond precision
// rather than casting duration. This avoids truncation issues causing inner
// scopes overruning outer scopes.
- ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+ ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
return (time_point_cast<nanoseconds>(Start) -
time_point_cast<nanoseconds>(StartTime))
.count();
}
- ClockType::rep getFlameGraphDurUs() const {
+ ClockType::rep getFlameGraphDurNs() const {
return (time_point_cast<nanoseconds>(End) -
time_point_cast<nanoseconds>(Start))
.count();
@@ -114,9 +114,9 @@ struct llvm::TimeTraceProfiler {
// Check that end times monotonically increase.
assert((Entries.empty() ||
- (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
- Entries.back().getFlameGraphStartUs(StartTime) +
- Entries.back().getFlameGraphDurUs())) &&
+ (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
+ Entries.back().getFlameGraphStartNs(StartTime) +
+ Entries.back().getFlameGraphDurNs())) &&
"TimeProfiler scope ended earlier than previous scope");
// Calculate duration at full precision for overall counts.
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
// Emit all events for the main flame graph.
auto writeEvent = [&](const auto &E, uint64_t Tid) {
- auto StartUs = E.getFlameGraphStartUs(StartTime);
- auto DurUs = E.getFlameGraphDurUs();
+ auto StartNs = E.getFlameGraphStartNs(StartTime);
+ auto DurNs = E.getFlameGraphDurNs();
J.object([&] {
J.attribute("pid", Pid);
J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
- J.attribute("ts", StartUs / 1000);
- J.attribute("dur", DurUs / 1000);
+ J.attribute("ts", StartNs / 1000);
+ J.attribute("dur", DurNs / 1000);
J.attribute("name", E.Name);
if (!E.Detail.empty()) {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
// Report totals on separate threads of tracing file.
uint64_t TotalTid = MaxTid + 1;
for (const NameAndCountAndDurationType &Total : SortedTotals) {
- auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
+ auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
auto Count = AllCountAndTotalPerName[Total.first].first;
J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurUs / 1000);
+ J.attribute("dur", DurNs / 1000 );
J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
- J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
+ J.attribute("avg us", int64_t(DurNs / Count / 1000));
});
});
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d6c6a25ac74234..f69d9d5330d2df 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -66,7 +66,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
//TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
- TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -238,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgsTy *KernelArgs) {
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-
- TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
-
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
DeviceId, DPxPTR(HostPtr));
@@ -265,7 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
+ "NumTeams="+std::to_string(NumTeams)+
+ ";NumArgs="+std::to_string(KernelArgs->NumArgs)
+ , Loc);
+
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -292,7 +293,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
/* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
+
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
@@ -301,7 +302,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-
+ }
return OMP_TGT_SUCCESS;
}
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 90b4dc92c299f8..46f8dcb0f087af 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -538,7 +538,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -821,7 +821,6 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
- //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
int Ret = OFFLOAD_SUCCESS;
auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
// process each input.
@@ -914,8 +913,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
- TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1456,8 +1454,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
"variable (" DPxMOD ")\n",
DPxPTR(HstPtrVal));
continue;
- }
- TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
+ }
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret =
@@ -1642,8 +1639,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
+ "NumArguments="+std::to_string(KernelArgs.NumArgs)
+ +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+ +";TripCount="+std::to_string(KernelArgs.Tripcount)
+ , Loc);
+
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
"Multi dimensional launch not supported yet.");
>From f27edf4922bc3a1cca656cf24655dae15c72b571 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 15:58:50 +0000
Subject: [PATCH 06/14] Some fixes to the profiler
---
openmp/libomptarget/src/OpenMP/API.cpp | 7 +++++++
openmp/libomptarget/src/interface.cpp | 7 ++-----
openmp/libomptarget/src/omptarget.cpp | 5 -----
3 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 1769404faf8884..27802d86aff64e 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,6 +83,7 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
+ TIMESCOPE();
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -99,6 +100,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
}
EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
+ TIMESCOPE();
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -162,6 +164,11 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
TIMESCOPE();
+ /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
+ "NumArguments="+std::to_string(KernelArgs.NumArgs)
+ +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+ +";TripCount="+std::to_string(KernelArgs.Tripcount)
+ , __FUNCTION__);*/
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index f69d9d5330d2df..5eca63fdc39bda 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,6 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -300,8 +299,8 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
- handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
- assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+ handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+ assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
}
return OMP_TGT_SUCCESS;
}
@@ -401,7 +400,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
- //TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -413,7 +411,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
- //TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 46f8dcb0f087af..e0fd34f24e7e75 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) {
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
@@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,7 +527,6 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
- //TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
>From 2693c8ece70e6a246be3912d84248271584407e9 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 21:06:06 +0000
Subject: [PATCH 07/14] Adding information to some omp api calls
---
openmp/libomptarget/src/OpenMP/API.cpp | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 27802d86aff64e..9150bcd0789e81 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,7 +83,8 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
+ +";size="+std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -163,12 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
- TIMESCOPE();
- /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
- "NumArguments="+std::to_string(KernelArgs.NumArgs)
- +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
- +";TripCount="+std::to_string(KernelArgs.Tripcount)
- , __FUNCTION__);*/
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+ +";src_dev="+std::to_string(SrcDevice)
+ +";size="+std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -407,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+ +";src_dev="+std::to_string(SrcDevice)
+ +";size="+std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -436,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions,
int DstDevice, int SrcDevice) {
- TIMESCOPE();
DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -495,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+ +";src_dev="+std::to_string(SrcDevice)
+ +";size="+std::to_string(ElementSize)
+ +";num_dims="+std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
>From 15b301f3ddb9f38ae8576c682a91c9e1a40a58c9 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 12:26:51 +0000
Subject: [PATCH 08/14] Adding information to the LIBOMPTARGET profiler runtime
kernel and API calls.
---
openmp/libomptarget/src/interface.cpp | 14 ++++++++------
openmp/libomptarget/src/omptarget.cpp | 24 +++++++++++++++---------
2 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 5eca63fdc39bda..19fd6b21f540ec 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,9 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+ "NumArgs="+
+ std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -262,10 +264,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
- "NumTeams="+std::to_string(NumTeams)+
- ";NumArgs="+std::to_string(KernelArgs->NumArgs)
- , Loc);
-
+ "NumTeams="+std::to_string(NumTeams)+
+ ";NumArgs="+
+ std::to_string(KernelArgs->NumArgs), Loc);
+
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -292,7 +294,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
/* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
+
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e0fd34f24e7e75..890625263d946a 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -533,7 +533,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
- TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
+ "Size="+
+ std::to_string(ArgSizes[I])+
+ "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -908,7 +911,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
+ std::to_string(DataSize)+"B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1449,7 +1453,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
"variable (" DPxMOD ")\n",
DPxPTR(HstPtrVal));
continue;
- }
+ }
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret =
@@ -1558,7 +1562,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
return Ret;
});
-
return OFFLOAD_SUCCESS;
}
} // namespace
@@ -1635,11 +1638,14 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
- "NumArguments="+std::to_string(KernelArgs.NumArgs)
- +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
- +";TripCount="+std::to_string(KernelArgs.Tripcount)
- , Loc);
-
+ "NumArguments="+
+ std::to_string(KernelArgs.NumArgs)+
+ ";NumTeams="+
+ std::to_string(KernelArgs.NumTeams[0])+
+ ";TripCount="+
+ std::to_string(KernelArgs.Tripcount)
+ , Loc);
+
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
"Multi dimensional launch not supported yet.");
>From c24569ad0e59fc8c90f8ce272b8a43eba1ab0b06 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 19:14:01 +0000
Subject: [PATCH 09/14] Fixing format
---
llvm/lib/Support/TimeProfiler.cpp | 10 +++++-----
openmp/libomptarget/src/OpenMP/API.cpp | 24 ++++++++++++------------
openmp/libomptarget/src/interface.cpp | 12 ++++++------
openmp/libomptarget/src/omptarget.cpp | 24 ++++++++++--------------
4 files changed, 33 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4446583102a813..330a4d93378aff 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -226,7 +226,7 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurNs / 1000 );
+ J.attribute("dur", DurNs / 1000);
J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
@@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler {
// Emit the absolute time when this TimeProfiler started.
// This can be used to combine the profiling data from
// multiple processes and preserve actual time intervals.
- J.attribute("beginningOfTime",
- time_point_cast<nanoseconds>(BeginningOfTime)
- .time_since_epoch()
- .count()/1000);
+ J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime)
+ .time_since_epoch()
+ .count() /
+ 1000);
J.objectEnd();
}
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 9150bcd0789e81..a7b6eac8bcd658 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,8 +83,8 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
- +";size="+std::to_string(Size));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
+ ";size=" + std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -164,9 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
- +";src_dev="+std::to_string(SrcDevice)
- +";size="+std::to_string(Length));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -405,9 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
- +";src_dev="+std::to_string(SrcDevice)
- +";size="+std::to_string(Length));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -494,10 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
- +";src_dev="+std::to_string(SrcDevice)
- +";size="+std::to_string(ElementSize)
- +";num_dims="+std::to_string(NumDims));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(ElementSize) +
+ ";num_dims=" + std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 19fd6b21f540ec..88323778fe709e 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -66,8 +66,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
- "NumArgs="+
- std::to_string(ArgNum), Loc);
+ "NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -263,10 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
- "NumTeams="+std::to_string(NumTeams)+
- ";NumArgs="+
- std::to_string(KernelArgs->NumArgs), Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Runtime target exe",
+ "NumTeams=" + std::to_string(NumTeams) +
+ ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
+ Loc);
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 890625263d946a..a7d55d7ebd5391 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -533,10 +533,8 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
- TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
- "Size="+
- std::to_string(ArgSizes[I])+
- "B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -911,8 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
- std::to_string(DataSize)+"B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1637,14 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
- "NumArguments="+
- std::to_string(KernelArgs.NumArgs)+
- ";NumTeams="+
- std::to_string(KernelArgs.NumTeams[0])+
- ";TripCount="+
- std::to_string(KernelArgs.Tripcount)
- , Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Kernel Target",
+ "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
+ ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
+ ";TripCount=" + std::to_string(KernelArgs.Tripcount),
+ Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
>From be468b177b6e18001dfe20bad804953919c10617 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 20:51:40 +0000
Subject: [PATCH 10/14] Change of lld/test/MachO/map-file.s: adding : to the
test check, given that the profile added the colons to make the total more
clear in the trace
---
lld/test/MachO/map-file.s | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index aa9fff9938eb28..279a15b8e33e60 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -89,7 +89,7 @@
# CHECK-NEXT: 0x[[#%X,BSS]] 0x00000001 [ 2] _number
# CHECK-EMPTY:
-# MAPFILE: "name":"Total Write map file"
+# MAPFILE: "name":"Total: Write map file"
# RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o -force_load \
# RUN: %t/libfoo.a %t/c-string-literal.o %t/libbaz.dylib -o %t/stripped
>From f95eb17f60284e94de10b80547e80946d9ef2f81 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Tue, 3 Oct 2023 12:10:13 +0000
Subject: [PATCH 11/14] Removing TimeProfiler microsecond to nanosecond change
from this PR, it will be a separate PR
---
llvm/lib/Support/TimeProfiler.cpp | 48 +++++++++++++++----------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 330a4d93378aff..4d625b3eb5b170 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
using std::chrono::duration;
using std::chrono::duration_cast;
-using std::chrono::nanoseconds;
+using std::chrono::microseconds;
using std::chrono::steady_clock;
using std::chrono::system_clock;
using std::chrono::time_point;
@@ -79,15 +79,15 @@ struct TimeTraceProfilerEntry {
// Calculate timings for FlameGraph. Cast time points to microsecond precision
// rather than casting duration. This avoids truncation issues causing inner
// scopes overruning outer scopes.
- ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
- return (time_point_cast<nanoseconds>(Start) -
- time_point_cast<nanoseconds>(StartTime))
+ ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+ return (time_point_cast<microseconds>(Start) -
+ time_point_cast<microseconds>(StartTime))
.count();
}
- ClockType::rep getFlameGraphDurNs() const {
- return (time_point_cast<nanoseconds>(End) -
- time_point_cast<nanoseconds>(Start))
+ ClockType::rep getFlameGraphDurUs() const {
+ return (time_point_cast<microseconds>(End) -
+ time_point_cast<microseconds>(Start))
.count();
}
};
@@ -114,16 +114,16 @@ struct llvm::TimeTraceProfiler {
// Check that end times monotonically increase.
assert((Entries.empty() ||
- (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
- Entries.back().getFlameGraphStartNs(StartTime) +
- Entries.back().getFlameGraphDurNs())) &&
+ (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
+ Entries.back().getFlameGraphStartUs(StartTime) +
+ Entries.back().getFlameGraphDurUs())) &&
"TimeProfiler scope ended earlier than previous scope");
// Calculate duration at full precision for overall counts.
DurationType Duration = E.End - E.Start;
// Only include sections longer or equal to TimeTraceGranularity msec.
- if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
+ if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
Entries.emplace_back(E);
// Track total time taken by each "name", but only the topmost levels of
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
// Emit all events for the main flame graph.
auto writeEvent = [&](const auto &E, uint64_t Tid) {
- auto StartNs = E.getFlameGraphStartNs(StartTime);
- auto DurNs = E.getFlameGraphDurNs();
+ auto StartUs = E.getFlameGraphStartUs(StartTime);
+ auto DurUs = E.getFlameGraphDurUs();
J.object([&] {
J.attribute("pid", Pid);
J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
- J.attribute("ts", StartNs / 1000);
- J.attribute("dur", DurNs / 1000);
+ J.attribute("ts", StartUs);
+ J.attribute("dur", DurUs);
J.attribute("name", E.Name);
if (!E.Detail.empty()) {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
// Report totals on separate threads of tracing file.
uint64_t TotalTid = MaxTid + 1;
for (const NameAndCountAndDurationType &Total : SortedTotals) {
- auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
+ auto DurUs = duration_cast<microseconds>(Total.second.second).count();
auto Count = AllCountAndTotalPerName[Total.first].first;
J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurNs / 1000);
- J.attribute("name", "Total: " + Total.first);
+ J.attribute("dur", DurUs);
+ J.attribute("name", "Total " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
- J.attribute("avg us", int64_t(DurNs / Count / 1000));
+ J.attribute("avg ms", int64_t(DurUs / Count / 1000));
});
});
@@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler {
// Emit the absolute time when this TimeProfiler started.
// This can be used to combine the profiling data from
// multiple processes and preserve actual time intervals.
- J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime)
- .time_since_epoch()
- .count() /
- 1000);
+ J.attribute("beginningOfTime",
+ time_point_cast<microseconds>(BeginningOfTime)
+ .time_since_epoch()
+ .count());
J.objectEnd();
}
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
SmallString<0> ThreadName;
const uint64_t Tid;
- // Minimum time granularity (in nanoseconds)
+ // Minimum time granularity (in microseconds)
const unsigned TimeTraceGranularity;
};
>From e78708d9428d8cd1481a9105ca565c9838ea02ef Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Tue, 3 Oct 2023 14:09:18 +0000
Subject: [PATCH 12/14] restoring lld/test/MachO/map-file.s test, because the
change was necessary for the changes to the llvm/lib/Support/TimeProfiler.cpp,
which were removed from this PR
---
lld/test/MachO/map-file.s | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index 279a15b8e33e60..aa9fff9938eb28 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -89,7 +89,7 @@
# CHECK-NEXT: 0x[[#%X,BSS]] 0x00000001 [ 2] _number
# CHECK-EMPTY:
-# MAPFILE: "name":"Total: Write map file"
+# MAPFILE: "name":"Total Write map file"
# RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o -force_load \
# RUN: %t/libfoo.a %t/c-string-literal.o %t/libbaz.dylib -o %t/stripped
>From bf5d83c3f879a649638bab2a4c75241e23bb6dab Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Fri, 27 Oct 2023 20:58:26 +0000
Subject: [PATCH 13/14] Adding kernel function name to the slice name
---
openmp/libomptarget/src/interface.cpp | 4 ++--
openmp/libomptarget/src/omptarget.cpp | 6 +++---
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 88323778fe709e..86c95416c41db4 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime Data Copy:",
"NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -263,7 +263,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "Runtime target exe",
+ "Runtime target exe:",
"NumTeams=" + std::to_string(NumTeams) +
";NumArgs=" + std::to_string(KernelArgs->NumArgs),
Loc);
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index a7d55d7ebd5391..77a648de567a87 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -534,7 +534,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
+ "HostToDev:", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -910,7 +910,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
+ "DevToHost:", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1636,7 +1636,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "Kernel Target",
+ "Kernel Target:",
"NumArguments=" + std::to_string(KernelArgs.NumArgs) +
";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
";TripCount=" + std::to_string(KernelArgs.Tripcount),
>From d56e4b27f606bcb8cdf6b8ec5af20b80b8ad0d5b Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Fri, 22 Dec 2023 19:06:04 +0000
Subject: [PATCH 14/14] Rebase fix features
---
openmp/libomptarget/include/Shared/Profile.h | 12 ++++++++++++
openmp/libomptarget/src/interface.cpp | 11 ++++++-----
openmp/libomptarget/src/omptarget.cpp | 6 +++---
3 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/openmp/libomptarget/include/Shared/Profile.h b/openmp/libomptarget/include/Shared/Profile.h
index 19ca0cf2275186..7e580988a39baf 100644
--- a/openmp/libomptarget/include/Shared/Profile.h
+++ b/openmp/libomptarget/include/Shared/Profile.h
@@ -97,4 +97,16 @@ class Profiler {
std::string RTM = RegionTypeMsg; \
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+/// Time spent in the current scope, assigned to the regionType
+/// with details from runtime
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
+ SourceInfo SI(IDENT); \
+ std::string ProfileLocation = SI.getProfileLocation(); \
+ llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
+
+/// Time spent in the current scope, assigned to the function name and source
+/// with details
+#define TIMESCOPE_WITH_DETAILS(Details) \
+ llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
+
#endif // OMPTARGET_SHARED_PROFILE_H
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 86c95416c41db4..61d9db17f51006 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime Data Copy:",
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
"NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -263,7 +263,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "Runtime target exe:",
+ "Runtime: target exe",
"NumTeams=" + std::to_string(NumTeams) +
";NumArgs=" + std::to_string(KernelArgs->NumArgs),
Loc);
@@ -297,9 +297,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
-
- if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
+ { // required to show synchronization
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
+ if (Rc == OFFLOAD_SUCCESS)
+ Rc = AsyncInfo.synchronize();
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 77a648de567a87..a7d55d7ebd5391 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -534,7 +534,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "HostToDev:", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
+ "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -910,7 +910,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "DevToHost:", "Size=" + std::to_string(DataSize) + "B", Loc);
+ "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1636,7 +1636,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "Kernel Target:",
+ "Kernel Target",
"NumArguments=" + std::to_string(KernelArgs.NumArgs) +
";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
";TripCount=" + std::to_string(KernelArgs.Tripcount),
More information about the Openmp-commits
mailing list