[flang-commits] [lldb] [mlir] [libunwind] [libcxx] [clang] [clang-tools-extra] [openmp] [lld] [libc] [llvm] [compiler-rt] [flang] [OpenMP] Improve omp offload profiler (PR #68016)
Felipe Cabarcas via flang-commits
flang-commits at lists.llvm.org
Fri Dec 22 07:15:41 PST 2023
https://github.com/fel-cab updated https://github.com/llvm/llvm-project/pull/68016
>From dd44de067c26ba94b6561c5ed7fa4a5d812a3d1a Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Mon, 18 Sep 2023 12:07:12 +0000
Subject: [PATCH 01/14] testing Profiler features
---
openmp/libomptarget/src/interface.cpp | 5 ++++-
openmp/libomptarget/src/private.h | 3 ++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 5f21b16b3fbfb1..f64e1e268a3952 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -252,7 +252,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_IDENT(Loc);
+ //TIMESCOPE_WITH_IDENT(Loc);
+ TIMESCOPE();
+ //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
+ //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index cbce15b63a3eba..dc6cd394423395 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -433,7 +433,8 @@ class ExponentialBackoff {
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
std::string RTM = RegionTypeMsg; \
- llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+ llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
+ //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
#else
#define TIMESCOPE()
#define TIMESCOPE_WITH_IDENT(IDENT)
>From 92586bca6364100c7511ad38a30f41b0f86dea9c Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 12:02:53 +0000
Subject: [PATCH 02/14] Improve Profiler 1
---
llvm/lib/Support/TimeProfiler.cpp | 2 +-
openmp/libomptarget/src/interface.cpp | 17 +++++++++--------
openmp/libomptarget/src/omptarget.cpp | 10 +++++-----
openmp/libomptarget/src/private.h | 5 +++--
4 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4d625b3eb5b170..e1458116f64ab4 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -227,7 +227,7 @@ struct llvm::TimeTraceProfiler {
J.attribute("ph", "X");
J.attribute("ts", 0);
J.attribute("dur", DurUs);
- J.attribute("name", "Total " + Total.first);
+ J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
J.attribute("avg ms", int64_t(DurUs / Count / 1000));
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index f64e1e268a3952..b8892cbe689107 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,14 @@ using namespace llvm::omp::target::ompt;
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
- TIMESCOPE();
+ //TIMESCOPE();
PM->RTLs.registerRequires(Flags);
}
////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
+ //TIMESCOPE();
if (PM->maybeDelayRegisterLib(Desc))
return;
@@ -61,7 +61,7 @@ EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); }
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
+ //TIMESCOPE();
PM->RTLs.unregisterLib(Desc);
for (auto &RTL : PM->RTLs.UsedRTLs) {
if (RTL->unregister_lib) {
@@ -82,7 +82,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+ //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+ TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -253,9 +254,9 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
//TIMESCOPE_WITH_IDENT(Loc);
- TIMESCOPE();
+ //TIMESCOPE();
//TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
- //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
+ //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
@@ -411,7 +412,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
- TIMESCOPE();
+ //TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -423,7 +424,7 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 40419e44894260..3754f63909dac9 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -398,7 +398,7 @@ static int32_t getParentIndex(int64_t Type) {
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -427,7 +427,7 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
@@ -453,7 +453,7 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -493,7 +493,7 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
- TIMESCOPE();
+ //TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
DeviceTy *DevicePtr = nullptr;
@@ -572,7 +572,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
- TIMESCOPE_WITH_IDENT(Loc);
+ //TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index dc6cd394423395..b1ada09d64c7a5 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -433,8 +433,9 @@ class ExponentialBackoff {
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
std::string RTM = RegionTypeMsg; \
- llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
- //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+ llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+ //llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
+
#else
#define TIMESCOPE()
#define TIMESCOPE_WITH_IDENT(IDENT)
>From f9167dc8fef277ac1aa53e2e95bade3f0b727df1 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 21:33:24 +0000
Subject: [PATCH 03/14] Changed profiling to work in nanoseconds. Made
Profiling calls for runtime calls and different ones for kernel launches and
memory transfers.
---
llvm/lib/Support/TimeProfiler.cpp | 28 +++++++++++++--------------
openmp/libomptarget/src/interface.cpp | 7 ++-----
openmp/libomptarget/src/omptarget.cpp | 11 +++++++----
openmp/libomptarget/src/private.h | 6 +++---
4 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index e1458116f64ab4..64b3ef35be27c4 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
using std::chrono::duration;
using std::chrono::duration_cast;
-using std::chrono::microseconds;
+using std::chrono::nanoseconds;
using std::chrono::steady_clock;
using std::chrono::system_clock;
using std::chrono::time_point;
@@ -80,14 +80,14 @@ struct TimeTraceProfilerEntry {
// rather than casting duration. This avoids truncation issues causing inner
// scopes overruning outer scopes.
ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
- return (time_point_cast<microseconds>(Start) -
- time_point_cast<microseconds>(StartTime))
+ return (time_point_cast<nanoseconds>(Start) -
+ time_point_cast<nanoseconds>(StartTime))
.count();
}
ClockType::rep getFlameGraphDurUs() const {
- return (time_point_cast<microseconds>(End) -
- time_point_cast<microseconds>(Start))
+ return (time_point_cast<nanoseconds>(End) -
+ time_point_cast<nanoseconds>(Start))
.count();
}
};
@@ -123,7 +123,7 @@ struct llvm::TimeTraceProfiler {
DurationType Duration = E.End - E.Start;
// Only include sections longer or equal to TimeTraceGranularity msec.
- if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
+ if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
Entries.emplace_back(E);
// Track total time taken by each "name", but only the topmost levels of
@@ -169,8 +169,8 @@ struct llvm::TimeTraceProfiler {
J.attribute("pid", Pid);
J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
- J.attribute("ts", StartUs);
- J.attribute("dur", DurUs);
+ J.attribute("ts", StartUs / 1000);
+ J.attribute("dur", DurUs / 1000);
J.attribute("name", E.Name);
if (!E.Detail.empty()) {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
// Report totals on separate threads of tracing file.
uint64_t TotalTid = MaxTid + 1;
for (const NameAndCountAndDurationType &Total : SortedTotals) {
- auto DurUs = duration_cast<microseconds>(Total.second.second).count();
+ auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
auto Count = AllCountAndTotalPerName[Total.first].first;
J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurUs);
+ J.attribute("dur", DurUs / 1000);
J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
- J.attribute("avg ms", int64_t(DurUs / Count / 1000));
+ J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
});
});
@@ -262,9 +262,9 @@ struct llvm::TimeTraceProfiler {
// This can be used to combine the profiling data from
// multiple processes and preserve actual time intervals.
J.attribute("beginningOfTime",
- time_point_cast<microseconds>(BeginningOfTime)
+ time_point_cast<nanoseconds>(BeginningOfTime)
.time_since_epoch()
- .count());
+ .count()/1000);
J.objectEnd();
}
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
SmallString<0> ThreadName;
const uint64_t Tid;
- // Minimum time granularity (in microseconds)
+ // Minimum time granularity (in nanoseconds)
const unsigned TimeTraceGranularity;
};
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index b8892cbe689107..d4ee246f84449f 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -83,7 +83,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
//TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
- TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
+ TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -253,10 +253,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
- //TIMESCOPE_WITH_IDENT(Loc);
- //TIMESCOPE();
- //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
- //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
+ TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 3754f63909dac9..ad966e7e1c4754 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -863,6 +863,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+ //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
int Ret = OFFLOAD_SUCCESS;
auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
// process each input.
@@ -955,7 +956,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-
+ TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1445,7 +1446,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
+ //TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
DeviceTy &Device = *PM->Devices[DeviceId];
int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, AsyncInfo);
@@ -1493,6 +1494,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
DPxPTR(HstPtrVal));
continue;
}
+ TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
@@ -1572,7 +1574,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
+ //TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
DeviceTy &Device = *PM->Devices[DeviceId];
// Move data from device.
@@ -1597,6 +1599,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
return Ret;
});
+
return OFFLOAD_SUCCESS;
}
} // namespace
@@ -1672,7 +1675,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
+ TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index b1ada09d64c7a5..f0591cd17b0fd1 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -432,10 +432,10 @@ class ExponentialBackoff {
#define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
+ std::string ProfileName = SI.getName(); \
std::string RTM = RegionTypeMsg; \
- llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
- //llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
-
+ llvm::TimeTraceScope TimeScope(ProfileName, ProfileLocation + RTM)
+ //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
#else
#define TIMESCOPE()
#define TIMESCOPE_WITH_IDENT(IDENT)
>From c82ce52f244d218752fea2dcc1f347fc589cd016 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Thu, 21 Sep 2023 14:22:28 +0000
Subject: [PATCH 04/14] test with DevToHost
---
openmp/libomptarget/src/omptarget.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index ad966e7e1c4754..e113942375ef9c 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -956,7 +956,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
+ std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
+ TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
>From 448f0e77b6c824de73cbd9ae34d4c59b02e7e441 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Fri, 22 Sep 2023 21:48:57 +0000
Subject: [PATCH 05/14] Fixing nanoseconds in totals, adding synchronize
timings, and adding extra info in kernels and device
---
llvm/lib/Support/TimeProfiler.cpp | 24 ++++++++++++------------
openmp/libomptarget/src/interface.cpp | 18 ++++++++++--------
openmp/libomptarget/src/omptarget.cpp | 19 +++++++++----------
openmp/libomptarget/src/private.h | 10 +++++++---
4 files changed, 38 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 64b3ef35be27c4..4446583102a813 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -79,13 +79,13 @@ struct TimeTraceProfilerEntry {
// Calculate timings for FlameGraph. Cast time points to microsecond precision
// rather than casting duration. This avoids truncation issues causing inner
// scopes overruning outer scopes.
- ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+ ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
return (time_point_cast<nanoseconds>(Start) -
time_point_cast<nanoseconds>(StartTime))
.count();
}
- ClockType::rep getFlameGraphDurUs() const {
+ ClockType::rep getFlameGraphDurNs() const {
return (time_point_cast<nanoseconds>(End) -
time_point_cast<nanoseconds>(Start))
.count();
@@ -114,9 +114,9 @@ struct llvm::TimeTraceProfiler {
// Check that end times monotonically increase.
assert((Entries.empty() ||
- (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
- Entries.back().getFlameGraphStartUs(StartTime) +
- Entries.back().getFlameGraphDurUs())) &&
+ (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
+ Entries.back().getFlameGraphStartNs(StartTime) +
+ Entries.back().getFlameGraphDurNs())) &&
"TimeProfiler scope ended earlier than previous scope");
// Calculate duration at full precision for overall counts.
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
// Emit all events for the main flame graph.
auto writeEvent = [&](const auto &E, uint64_t Tid) {
- auto StartUs = E.getFlameGraphStartUs(StartTime);
- auto DurUs = E.getFlameGraphDurUs();
+ auto StartNs = E.getFlameGraphStartNs(StartTime);
+ auto DurNs = E.getFlameGraphDurNs();
J.object([&] {
J.attribute("pid", Pid);
J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
- J.attribute("ts", StartUs / 1000);
- J.attribute("dur", DurUs / 1000);
+ J.attribute("ts", StartNs / 1000);
+ J.attribute("dur", DurNs / 1000);
J.attribute("name", E.Name);
if (!E.Detail.empty()) {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
// Report totals on separate threads of tracing file.
uint64_t TotalTid = MaxTid + 1;
for (const NameAndCountAndDurationType &Total : SortedTotals) {
- auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
+ auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
auto Count = AllCountAndTotalPerName[Total.first].first;
J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurUs / 1000);
+ J.attribute("dur", DurNs / 1000 );
J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
- J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
+ J.attribute("avg us", int64_t(DurNs / Count / 1000));
});
});
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d4ee246f84449f..bed9b1e40db455 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -83,7 +83,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
//TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
- TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -252,9 +252,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgsTy *KernelArgs) {
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-
- TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
-
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
DeviceId, DPxPTR(HostPtr));
@@ -279,7 +276,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
+ "NumTeams="+std::to_string(NumTeams)+
+ ";NumArgs="+std::to_string(KernelArgs->NumArgs)
+ , Loc);
+
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -303,16 +304,17 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
/* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
+
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
-
+ {
+ TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-
+ }
return OMP_TGT_SUCCESS;
}
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e113942375ef9c..5f6168b0bd2fca 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -579,7 +579,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -863,7 +863,6 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
- //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
int Ret = OFFLOAD_SUCCESS;
auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
// process each input.
@@ -956,8 +955,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
- TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1447,7 +1445,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- //TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
DeviceTy &Device = *PM->Devices[DeviceId];
int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, AsyncInfo);
@@ -1494,8 +1491,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
"variable (" DPxMOD ")\n",
DPxPTR(HstPtrVal));
continue;
- }
- TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
+ }
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
@@ -1575,7 +1571,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- //TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
DeviceTy &Device = *PM->Devices[DeviceId];
// Move data from device.
@@ -1676,8 +1671,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
+ "NumArguments="+std::to_string(KernelArgs.NumArgs)
+ +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+ +";TripCount="+std::to_string(KernelArgs.Tripcount)
+ , Loc);
+
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
"Multi dimensional launch not supported yet.");
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index f0591cd17b0fd1..4bc1db79de3f2b 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -432,14 +432,18 @@ class ExponentialBackoff {
#define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
- std::string ProfileName = SI.getName(); \
+ std::string ProfileName = SI.getName(); \
std::string RTM = RegionTypeMsg; \
- llvm::TimeTraceScope TimeScope(ProfileName, ProfileLocation + RTM)
+ llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
//llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
+ SourceInfo SI(IDENT); \
+ std::string ProfileLocation = SI.getProfileLocation(); \
+ llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
#else
#define TIMESCOPE()
#define TIMESCOPE_WITH_IDENT(IDENT)
#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)
#define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)
-
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)
#endif
>From c8bb24e807324a6a42b50076e5a3d2159f1d6d74 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 15:58:50 +0000
Subject: [PATCH 06/14] Some fixes to the profiler
---
openmp/libomptarget/src/api.cpp | 7 +++++++
openmp/libomptarget/src/interface.cpp | 16 +++++-----------
openmp/libomptarget/src/omptarget.cpp | 5 -----
3 files changed, 12 insertions(+), 16 deletions(-)
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 942df8fdb94d66..f628a64c5b69fa 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -50,6 +50,7 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
+ TIMESCOPE();
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -66,6 +67,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
}
EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
+ TIMESCOPE();
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -134,6 +136,11 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
TIMESCOPE();
+ /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
+ "NumArguments="+std::to_string(KernelArgs.NumArgs)
+ +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+ +";TripCount="+std::to_string(KernelArgs.Tripcount)
+ , __FUNCTION__);*/
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index bed9b1e40db455..61a340ccf8d1b1 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
- //TIMESCOPE();
PM->RTLs.registerRequires(Flags);
}
////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
- //TIMESCOPE();
if (PM->maybeDelayRegisterLib(Desc))
return;
@@ -61,7 +59,6 @@ EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); }
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
- //TIMESCOPE();
PM->RTLs.unregisterLib(Desc);
for (auto &RTL : PM->RTLs.UsedRTLs) {
if (RTL->unregister_lib) {
@@ -82,7 +79,6 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -307,13 +303,13 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
- {
+ { //required to show syncronization
TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc);
- if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
+ if (Rc == OFFLOAD_SUCCESS)
+ Rc = AsyncInfo.synchronize();
- handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
- assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+ handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+ assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
}
return OMP_TGT_SUCCESS;
}
@@ -411,7 +407,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
- //TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -423,7 +418,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
- //TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 5f6168b0bd2fca..450f34894fb56b 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -398,7 +398,6 @@ static int32_t getParentIndex(int64_t Type) {
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -427,7 +426,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
@@ -453,7 +451,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -493,7 +490,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
- //TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
DeviceTy *DevicePtr = nullptr;
@@ -572,7 +568,6 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
- //TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
>From da71cf17918c56e6a64c1e966dbb5d0dd79d0ed9 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 21:06:06 +0000
Subject: [PATCH 07/14] Adding information to some omp api calls
---
openmp/libomptarget/src/api.cpp | 22 ++++++++++++----------
openmp/libomptarget/src/private.h | 7 +++++--
2 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index f628a64c5b69fa..5dd91880849299 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -50,7 +50,8 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
+ +";size="+std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -135,12 +136,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
- TIMESCOPE();
- /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
- "NumArguments="+std::to_string(KernelArgs.NumArgs)
- +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
- +";TripCount="+std::to_string(KernelArgs.Tripcount)
- , __FUNCTION__);*/
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+ +";src_dev="+std::to_string(SrcDevice)
+ +";size="+std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -293,7 +291,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+ +";src_dev="+std::to_string(SrcDevice)
+ +";size="+std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -321,7 +321,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions,
int DstDevice, int SrcDevice) {
- TIMESCOPE();
DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -380,7 +379,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+ +";src_dev="+std::to_string(SrcDevice)
+ +";size="+std::to_string(ElementSize)
+ +";num_dims="+std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 4bc1db79de3f2b..c8d07138b180d1 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -434,16 +434,19 @@ class ExponentialBackoff {
std::string ProfileLocation = SI.getProfileLocation(); \
std::string ProfileName = SI.getName(); \
std::string RTM = RegionTypeMsg; \
- llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
- //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+ llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+ //llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
+#define TIMESCOPE_WITH_DETAILS(Details) \
+ llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
#else
#define TIMESCOPE()
#define TIMESCOPE_WITH_IDENT(IDENT)
#define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)
#define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)
+#define TIMESCOPE_WITH_DETAILS(Details)
#endif
>From f273bbcc66f361fe9cc03d8597ee886122b5e235 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 12:26:51 +0000
Subject: [PATCH 08/14] Adding information to the LIBOMPTARGET profiler runtime
kernel and API calls.
---
openmp/libomptarget/src/interface.cpp | 14 ++++++++------
openmp/libomptarget/src/omptarget.cpp | 24 +++++++++++++++---------
2 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 61a340ccf8d1b1..99a7abc7e0bcee 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -79,7 +79,9 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+ "NumArgs="+
+ std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -273,10 +275,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
- "NumTeams="+std::to_string(NumTeams)+
- ";NumArgs="+std::to_string(KernelArgs->NumArgs)
- , Loc);
-
+ "NumTeams="+std::to_string(NumTeams)+
+ ";NumArgs="+
+ std::to_string(KernelArgs->NumArgs), Loc);
+
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -300,7 +302,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
/* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
+
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
{ //required to show syncronization
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 450f34894fb56b..b5a2dfc6856908 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -574,7 +574,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
- TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
+ "Size="+
+ std::to_string(ArgSizes[I])+
+ "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -950,7 +953,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
+ std::to_string(DataSize)+"B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1486,7 +1490,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
"variable (" DPxMOD ")\n",
DPxPTR(HstPtrVal));
continue;
- }
+ }
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
@@ -1590,7 +1594,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
return Ret;
});
-
return OFFLOAD_SUCCESS;
}
} // namespace
@@ -1667,11 +1670,14 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
- "NumArguments="+std::to_string(KernelArgs.NumArgs)
- +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
- +";TripCount="+std::to_string(KernelArgs.Tripcount)
- , Loc);
-
+ "NumArguments="+
+ std::to_string(KernelArgs.NumArgs)+
+ ";NumTeams="+
+ std::to_string(KernelArgs.NumTeams[0])+
+ ";TripCount="+
+ std::to_string(KernelArgs.Tripcount)
+ , Loc);
+
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
"Multi dimensional launch not supported yet.");
>From 08dbdd5ba1c0502b6d1c935bac6cc14acd4f04be Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 19:14:01 +0000
Subject: [PATCH 09/14] Fixing format
---
llvm/lib/Support/TimeProfiler.cpp | 10 +++----
openmp/libomptarget/src/api.cpp | 24 ++++++++---------
openmp/libomptarget/src/interface.cpp | 14 +++++-----
openmp/libomptarget/src/omptarget.cpp | 39 ++++++++++++---------------
openmp/libomptarget/src/private.h | 2 +-
5 files changed, 42 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4446583102a813..330a4d93378aff 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -226,7 +226,7 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurNs / 1000 );
+ J.attribute("dur", DurNs / 1000);
J.attribute("name", "Total: " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
@@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler {
// Emit the absolute time when this TimeProfiler started.
// This can be used to combine the profiling data from
// multiple processes and preserve actual time intervals.
- J.attribute("beginningOfTime",
- time_point_cast<nanoseconds>(BeginningOfTime)
- .time_since_epoch()
- .count()/1000);
+ J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime)
+ .time_since_epoch()
+ .count() /
+ 1000);
J.objectEnd();
}
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 5dd91880849299..06de1f8f20b7ae 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -50,8 +50,8 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
- +";size="+std::to_string(Size));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
+ ";size=" + std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -136,9 +136,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
- +";src_dev="+std::to_string(SrcDevice)
- +";size="+std::to_string(Length));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -291,9 +291,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
- +";src_dev="+std::to_string(SrcDevice)
- +";size="+std::to_string(Length));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -379,10 +379,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
- +";src_dev="+std::to_string(SrcDevice)
- +";size="+std::to_string(ElementSize)
- +";num_dims="+std::to_string(NumDims));
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(ElementSize) +
+ ";num_dims=" + std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 99a7abc7e0bcee..2c7ab7a49d0bfb 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -80,8 +80,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
- "NumArgs="+
- std::to_string(ArgNum), Loc);
+ "NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -274,10 +273,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
- "NumTeams="+std::to_string(NumTeams)+
- ";NumArgs="+
- std::to_string(KernelArgs->NumArgs), Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Runtime target exe",
+ "NumTeams=" + std::to_string(NumTeams) +
+ ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
+ Loc);
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
@@ -305,7 +305,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
- { //required to show syncronization
+ { // required to show syncronization
TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index b5a2dfc6856908..277f95d7efa820 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -574,10 +574,8 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
- TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
- "Size="+
- std::to_string(ArgSizes[I])+
- "B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -825,14 +823,13 @@ postProcessingTargetDataEnd(DeviceTy *Device,
// remaining shadow pointer entries for this struct.
const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
if (HasFrom) {
- Entry->foreachShadowPointerInfo(
- [&](const ShadowPtrInfoTy &ShadowPtr) {
- *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
- DP("Restoring original host pointer value " DPxMOD " for host "
- "pointer " DPxMOD "\n",
- DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
- return OFFLOAD_SUCCESS;
- });
+ Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
+ *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
+ DP("Restoring original host pointer value " DPxMOD " for host "
+ "pointer " DPxMOD "\n",
+ DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
+ return OFFLOAD_SUCCESS;
+ });
}
// Give up the lock as we either don't need it anymore (e.g., done with
@@ -953,8 +950,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
- TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
- std::to_string(DataSize)+"B", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1669,14 +1666,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
- "NumArguments="+
- std::to_string(KernelArgs.NumArgs)+
- ";NumTeams="+
- std::to_string(KernelArgs.NumTeams[0])+
- ";TripCount="+
- std::to_string(KernelArgs.Tripcount)
- , Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Kernel Target",
+ "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
+ ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
+ ";TripCount=" + std::to_string(KernelArgs.Tripcount),
+ Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index c8d07138b180d1..8657390dde17dc 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -435,7 +435,7 @@ class ExponentialBackoff {
std::string ProfileName = SI.getName(); \
std::string RTM = RegionTypeMsg; \
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
- //llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
+// llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
>From 536f58d0b8639fbccb2467634bf52ab4e737c121 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 20:51:40 +0000
Subject: [PATCH 10/14] Change of lld/test/MachO/map-file.s: adding : to the
test check, given that the profile added the colons to make the total more
clear in the trace
---
lld/test/MachO/map-file.s | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index aa9fff9938eb28..279a15b8e33e60 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -89,7 +89,7 @@
# CHECK-NEXT: 0x[[#%X,BSS]] 0x00000001 [ 2] _number
# CHECK-EMPTY:
-# MAPFILE: "name":"Total Write map file"
+# MAPFILE: "name":"Total: Write map file"
# RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o -force_load \
# RUN: %t/libfoo.a %t/c-string-literal.o %t/libbaz.dylib -o %t/stripped
>From 089c0adb5f120a722c576f3feb8d000e621cfc84 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Tue, 3 Oct 2023 12:10:13 +0000
Subject: [PATCH 11/14] Removing TimeProfiler microsecond to nanosecond change
from this PR, it will be a separate PR
---
llvm/lib/Support/TimeProfiler.cpp | 48 +++++++++++++++----------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 330a4d93378aff..4d625b3eb5b170 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
using std::chrono::duration;
using std::chrono::duration_cast;
-using std::chrono::nanoseconds;
+using std::chrono::microseconds;
using std::chrono::steady_clock;
using std::chrono::system_clock;
using std::chrono::time_point;
@@ -79,15 +79,15 @@ struct TimeTraceProfilerEntry {
// Calculate timings for FlameGraph. Cast time points to microsecond precision
// rather than casting duration. This avoids truncation issues causing inner
// scopes overruning outer scopes.
- ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
- return (time_point_cast<nanoseconds>(Start) -
- time_point_cast<nanoseconds>(StartTime))
+ ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+ return (time_point_cast<microseconds>(Start) -
+ time_point_cast<microseconds>(StartTime))
.count();
}
- ClockType::rep getFlameGraphDurNs() const {
- return (time_point_cast<nanoseconds>(End) -
- time_point_cast<nanoseconds>(Start))
+ ClockType::rep getFlameGraphDurUs() const {
+ return (time_point_cast<microseconds>(End) -
+ time_point_cast<microseconds>(Start))
.count();
}
};
@@ -114,16 +114,16 @@ struct llvm::TimeTraceProfiler {
// Check that end times monotonically increase.
assert((Entries.empty() ||
- (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
- Entries.back().getFlameGraphStartNs(StartTime) +
- Entries.back().getFlameGraphDurNs())) &&
+ (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
+ Entries.back().getFlameGraphStartUs(StartTime) +
+ Entries.back().getFlameGraphDurUs())) &&
"TimeProfiler scope ended earlier than previous scope");
// Calculate duration at full precision for overall counts.
DurationType Duration = E.End - E.Start;
// Only include sections longer or equal to TimeTraceGranularity msec.
- if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
+ if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
Entries.emplace_back(E);
// Track total time taken by each "name", but only the topmost levels of
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
// Emit all events for the main flame graph.
auto writeEvent = [&](const auto &E, uint64_t Tid) {
- auto StartNs = E.getFlameGraphStartNs(StartTime);
- auto DurNs = E.getFlameGraphDurNs();
+ auto StartUs = E.getFlameGraphStartUs(StartTime);
+ auto DurUs = E.getFlameGraphDurUs();
J.object([&] {
J.attribute("pid", Pid);
J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
- J.attribute("ts", StartNs / 1000);
- J.attribute("dur", DurNs / 1000);
+ J.attribute("ts", StartUs);
+ J.attribute("dur", DurUs);
J.attribute("name", E.Name);
if (!E.Detail.empty()) {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
// Report totals on separate threads of tracing file.
uint64_t TotalTid = MaxTid + 1;
for (const NameAndCountAndDurationType &Total : SortedTotals) {
- auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
+ auto DurUs = duration_cast<microseconds>(Total.second.second).count();
auto Count = AllCountAndTotalPerName[Total.first].first;
J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
- J.attribute("dur", DurNs / 1000);
- J.attribute("name", "Total: " + Total.first);
+ J.attribute("dur", DurUs);
+ J.attribute("name", "Total " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
- J.attribute("avg us", int64_t(DurNs / Count / 1000));
+ J.attribute("avg ms", int64_t(DurUs / Count / 1000));
});
});
@@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler {
// Emit the absolute time when this TimeProfiler started.
// This can be used to combine the profiling data from
// multiple processes and preserve actual time intervals.
- J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime)
- .time_since_epoch()
- .count() /
- 1000);
+ J.attribute("beginningOfTime",
+ time_point_cast<microseconds>(BeginningOfTime)
+ .time_since_epoch()
+ .count());
J.objectEnd();
}
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
SmallString<0> ThreadName;
const uint64_t Tid;
- // Minimum time granularity (in nanoseconds)
+ // Minimum time granularity (in microseconds)
const unsigned TimeTraceGranularity;
};
>From 61c0adbccd26eb416322134acf69f360d4d5d7d6 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Tue, 3 Oct 2023 14:09:18 +0000
Subject: [PATCH 12/14] restoring lld/test/MachO/map-file.s test, because the
change was necessary for the changes to the llvm/lib/Support/TimeProfiler.cpp,
which was removed from this PR
---
lld/test/MachO/map-file.s | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index 279a15b8e33e60..aa9fff9938eb28 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -89,7 +89,7 @@
# CHECK-NEXT: 0x[[#%X,BSS]] 0x00000001 [ 2] _number
# CHECK-EMPTY:
-# MAPFILE: "name":"Total: Write map file"
+# MAPFILE: "name":"Total Write map file"
# RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o -force_load \
# RUN: %t/libfoo.a %t/c-string-literal.o %t/libbaz.dylib -o %t/stripped
>From a43813748a33a310fb014c336d747940c789b391 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Wed, 11 Oct 2023 14:19:30 +0000
Subject: [PATCH 13/14] Removing commented line and setting
TimeTraceGranularity to 0us to record any annotated event
---
openmp/libomptarget/src/private.h | 1 -
openmp/libomptarget/src/rtl.cpp | 2 +-
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 8657390dde17dc..71f42584249b17 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -435,7 +435,6 @@ class ExponentialBackoff {
std::string ProfileName = SI.getName(); \
std::string RTM = RegionTypeMsg; \
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
-// llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index fdedf2ee456acb..a45f558248b2db 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -66,7 +66,7 @@ __attribute__((constructor(101))) void init() {
ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
// TODO: add a configuration option for time granularity
if (ProfileTraceFile)
- timeTraceProfilerInitialize(500 /* us */, "libomptarget");
+ timeTraceProfilerInitialize(0 /* us */, "libomptarget");
#ifdef OMPT_SUPPORT
// Initialize OMPT first
>From 53bcadb43216231cd61a081071817c6a2b6094e4 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Fri, 27 Oct 2023 20:58:26 +0000
Subject: [PATCH 14/14] Adding kernel function name to the slice name
---
openmp/libomptarget/src/interface.cpp | 4 ++--
openmp/libomptarget/src/omptarget.cpp | 6 +++---
openmp/libomptarget/src/private.h | 4 +++-
3 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 3f8037cac8ab10..2842730637f444 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -79,7 +79,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime Data Copy:",
"NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -274,7 +274,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "Runtime target exe",
+ "Runtime target exe:",
"NumTeams=" + std::to_string(NumTeams) +
";NumArgs=" + std::to_string(KernelArgs->NumArgs),
Loc);
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 277f95d7efa820..f49885ba6fcd84 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -575,7 +575,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
+ "HostToDev:", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -951,7 +951,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
+ "DevToHost:", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1667,7 +1667,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
- "Kernel Target",
+ "Kernel Target:",
"NumArguments=" + std::to_string(KernelArgs.NumArgs) +
";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
";TripCount=" + std::to_string(KernelArgs.Tripcount),
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 47f5082edbad75..9b79da29cd17f0 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -449,7 +449,9 @@ class ExponentialBackoff {
#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
SourceInfo SI(IDENT); \
std::string ProfileLocation = SI.getProfileLocation(); \
- llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
+ std::string kernelName = SI.getName(); \
+ llvm::TimeTraceScope TimeScope(RegionTypeMsg+kernelName, \
+ ProfileLocation + Details)
#define TIMESCOPE_WITH_DETAILS(Details) \
llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
#else
More information about the flang-commits mailing list