[Openmp-commits] [lld] [llvm] [openmp] [OpenMP] Improve omp offload profiler (PR #68016)

Felipe Cabarcas via Openmp-commits openmp-commits at lists.llvm.org
Fri Dec 22 11:10:48 PST 2023


https://github.com/fel-cab updated https://github.com/llvm/llvm-project/pull/68016

>From 16e38453b6b9066824020829b1e22ab44dd5706b Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Mon, 18 Sep 2023 12:07:12 +0000
Subject: [PATCH 01/14] testing Profiler features

---
 openmp/libomptarget/src/interface.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d9e87640161f26..8b9f146695bb63 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -241,7 +241,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_IDENT(Loc);
+  //TIMESCOPE_WITH_IDENT(Loc);
+  TIMESCOPE();
+  //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
+  //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
 
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",

>From 0f850824541175eae650f0eac87a6425d5275b88 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 12:02:53 +0000
Subject: [PATCH 02/14] Improve Profiler 1

---
 llvm/lib/Support/TimeProfiler.cpp     |  2 +-
 openmp/libomptarget/src/interface.cpp | 14 ++++++--------
 openmp/libomptarget/src/omptarget.cpp | 10 +++++-----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4d625b3eb5b170..e1458116f64ab4 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -227,7 +227,7 @@ struct llvm::TimeTraceProfiler {
         J.attribute("ph", "X");
         J.attribute("ts", 0);
         J.attribute("dur", DurUs);
-        J.attribute("name", "Total " + Total.first);
+        J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
           J.attribute("avg ms", int64_t(DurUs / Count / 1000));
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 8b9f146695bb63..b98d33cce22ecb 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
 ////////////////////////////////////////////////////////////////////////////////
 /// adds requires flags
 EXTERN void __tgt_register_requires(int64_t Flags) {
-  TIMESCOPE();
   PM->addRequirements(Flags);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 /// adds a target shared library to the target execution image
 EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
-  TIMESCOPE();
   if (PM->delayRegisterLib(Desc))
     return;
 
@@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
 ////////////////////////////////////////////////////////////////////////////////
 /// unloads a target shared library
 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
-  TIMESCOPE();
   PM->unregisterLib(Desc);
 }
 
@@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+  //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+  TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -242,9 +240,9 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
 
   //TIMESCOPE_WITH_IDENT(Loc);
-  TIMESCOPE();
+  //TIMESCOPE();
   //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
-  //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
+  //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
 
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
@@ -405,7 +403,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
 
 // Get the current number of components for a user-defined mapper.
 EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
-  TIMESCOPE();
+  //TIMESCOPE();
   auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
   int64_t Size = MapperComponentsPtr->Components.size();
   DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -417,7 +415,7 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
 EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
                                         void *Begin, int64_t Size, int64_t Type,
                                         void *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("__tgt_push_mapper_component(Handle=" DPxMOD
      ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
      ", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e724b2f6db8b5f..90c30d391bd8cb 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -392,7 +392,7 @@ static int32_t getParentIndex(int64_t Type) {
 
 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                           const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -419,7 +419,7 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
 
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                         const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
      DPxPTR(DevicePtr));
 
@@ -444,7 +444,7 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
 
 void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                          const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -471,7 +471,7 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
 }
 
 void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
 
   auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,7 +531,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
                     bool FromMapper) {
-  TIMESCOPE_WITH_IDENT(Loc);
+  //TIMESCOPE_WITH_IDENT(Loc);
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     // Ignore private variables and arrays - there is no mapping for them.

>From 498cb39e8054245db88f9943722386d87ac6e17c Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 21:33:24 +0000
Subject: [PATCH 03/14] Changed profiling to work in nanoseconds. Made
 Profiling calls for runtime calls and different ones for kernel launches and
 memory transfers.

---
 llvm/lib/Support/TimeProfiler.cpp     | 28 +++++++++++++--------------
 openmp/libomptarget/src/interface.cpp |  7 ++-----
 openmp/libomptarget/src/omptarget.cpp | 10 ++++++----
 3 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index e1458116f64ab4..64b3ef35be27c4 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
 
 using std::chrono::duration;
 using std::chrono::duration_cast;
-using std::chrono::microseconds;
+using std::chrono::nanoseconds;
 using std::chrono::steady_clock;
 using std::chrono::system_clock;
 using std::chrono::time_point;
@@ -80,14 +80,14 @@ struct TimeTraceProfilerEntry {
   // rather than casting duration. This avoids truncation issues causing inner
   // scopes overruning outer scopes.
   ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
-    return (time_point_cast<microseconds>(Start) -
-            time_point_cast<microseconds>(StartTime))
+    return (time_point_cast<nanoseconds>(Start) -
+            time_point_cast<nanoseconds>(StartTime))
         .count();
   }
 
   ClockType::rep getFlameGraphDurUs() const {
-    return (time_point_cast<microseconds>(End) -
-            time_point_cast<microseconds>(Start))
+    return (time_point_cast<nanoseconds>(End) -
+            time_point_cast<nanoseconds>(Start))
         .count();
   }
 };
@@ -123,7 +123,7 @@ struct llvm::TimeTraceProfiler {
     DurationType Duration = E.End - E.Start;
 
     // Only include sections longer or equal to TimeTraceGranularity msec.
-    if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
+    if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
       Entries.emplace_back(E);
 
     // Track total time taken by each "name", but only the topmost levels of
@@ -169,8 +169,8 @@ struct llvm::TimeTraceProfiler {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
         J.attribute("ph", "X");
-        J.attribute("ts", StartUs);
-        J.attribute("dur", DurUs);
+        J.attribute("ts", StartUs / 1000);
+        J.attribute("dur", DurUs / 1000);
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
     // Report totals on separate threads of tracing file.
     uint64_t TotalTid = MaxTid + 1;
     for (const NameAndCountAndDurationType &Total : SortedTotals) {
-      auto DurUs = duration_cast<microseconds>(Total.second.second).count();
+      auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
       auto Count = AllCountAndTotalPerName[Total.first].first;
 
       J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
         J.attribute("tid", int64_t(TotalTid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
-        J.attribute("dur", DurUs);
+        J.attribute("dur", DurUs / 1000);
         J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
-          J.attribute("avg ms", int64_t(DurUs / Count / 1000));
+          J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
         });
       });
 
@@ -262,9 +262,9 @@ struct llvm::TimeTraceProfiler {
     // This can be used to combine the profiling data from
     // multiple processes and preserve actual time intervals.
     J.attribute("beginningOfTime",
-                time_point_cast<microseconds>(BeginningOfTime)
+                time_point_cast<nanoseconds>(BeginningOfTime)
                     .time_since_epoch()
-                    .count());
+                    .count()/1000);
 
     J.objectEnd();
   }
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
   SmallString<0> ThreadName;
   const uint64_t Tid;
 
-  // Minimum time granularity (in microseconds)
+  // Minimum time granularity (in nanoseconds)
   const unsigned TimeTraceGranularity;
 };
 
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index b98d33cce22ecb..d6c6a25ac74234 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -66,7 +66,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
-  TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
+  TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -239,10 +239,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  //TIMESCOPE_WITH_IDENT(Loc);
-  //TIMESCOPE();
-  //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
-  //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
+  TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
 
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 90c30d391bd8cb..e1f90093341c88 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -821,6 +821,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+  //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
   int Ret = OFFLOAD_SUCCESS;
   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
   // process each input.
@@ -913,7 +914,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-
+      TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1403,7 +1404,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                              SmallVector<ptrdiff_t> &TgtOffsets,
                              PrivateArgumentManagerTy &PrivateArgumentManager,
                              AsyncInfoTy &AsyncInfo) {
-  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
 
   auto DeviceOrErr = PM->getDevice(DeviceId);
   if (!DeviceOrErr)
@@ -1456,6 +1456,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
              DPxPTR(HstPtrVal));
           continue;
         }
+        TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret =
@@ -1537,7 +1538,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
-  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
+
   auto DeviceOrErr = PM->getDevice(DeviceId);
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
@@ -1564,6 +1565,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
         return Ret;
       });
 
+  
   return OFFLOAD_SUCCESS;
 }
 } // namespace
@@ -1639,7 +1641,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
-    TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
+    TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
 
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&

>From 5c40b78648128b50d464bbb360aaf4b1c72ff669 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Thu, 21 Sep 2023 14:22:28 +0000
Subject: [PATCH 04/14] test with DevToHost

---
 openmp/libomptarget/src/omptarget.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e1f90093341c88..90b4dc92c299f8 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -914,7 +914,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-      TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
+        std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
+        TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);      
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {

>From c12fd356e5ea3cb585065e433f413f6603db401c Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Fri, 22 Sep 2023 21:48:57 +0000
Subject: [PATCH 05/14] Fixing nanoseconds in totals, adding synchronize
 timings, and adding extra info in kernels and device

---
 llvm/lib/Support/TimeProfiler.cpp     | 24 ++++++++++++------------
 openmp/libomptarget/src/interface.cpp | 15 ++++++++-------
 openmp/libomptarget/src/omptarget.cpp | 17 +++++++++--------
 3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 64b3ef35be27c4..4446583102a813 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -79,13 +79,13 @@ struct TimeTraceProfilerEntry {
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
   // scopes overruning outer scopes.
-  ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+  ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
     return (time_point_cast<nanoseconds>(Start) -
             time_point_cast<nanoseconds>(StartTime))
         .count();
   }
 
-  ClockType::rep getFlameGraphDurUs() const {
+  ClockType::rep getFlameGraphDurNs() const {
     return (time_point_cast<nanoseconds>(End) -
             time_point_cast<nanoseconds>(Start))
         .count();
@@ -114,9 +114,9 @@ struct llvm::TimeTraceProfiler {
 
     // Check that end times monotonically increase.
     assert((Entries.empty() ||
-            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
-             Entries.back().getFlameGraphStartUs(StartTime) +
-                 Entries.back().getFlameGraphDurUs())) &&
+            (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
+             Entries.back().getFlameGraphStartNs(StartTime) +
+                 Entries.back().getFlameGraphDurNs())) &&
            "TimeProfiler scope ended earlier than previous scope");
 
     // Calculate duration at full precision for overall counts.
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
 
     // Emit all events for the main flame graph.
     auto writeEvent = [&](const auto &E, uint64_t Tid) {
-      auto StartUs = E.getFlameGraphStartUs(StartTime);
-      auto DurUs = E.getFlameGraphDurUs();
+      auto StartNs = E.getFlameGraphStartNs(StartTime);
+      auto DurNs = E.getFlameGraphDurNs();
 
       J.object([&] {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
         J.attribute("ph", "X");
-        J.attribute("ts", StartUs / 1000);
-        J.attribute("dur", DurUs / 1000);
+        J.attribute("ts", StartNs / 1000);
+        J.attribute("dur", DurNs / 1000);
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
     // Report totals on separate threads of tracing file.
     uint64_t TotalTid = MaxTid + 1;
     for (const NameAndCountAndDurationType &Total : SortedTotals) {
-      auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
+      auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
       auto Count = AllCountAndTotalPerName[Total.first].first;
 
       J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
         J.attribute("tid", int64_t(TotalTid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
-        J.attribute("dur", DurUs / 1000);
+        J.attribute("dur", DurNs / 1000 );
         J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
-          J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
+          J.attribute("avg us", int64_t(DurNs / Count / 1000));
         });
       });
 
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d6c6a25ac74234..f69d9d5330d2df 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -66,7 +66,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
-  TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -238,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                KernelArgsTy *KernelArgs) {
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-
-  TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
-
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
      DeviceId, DPxPTR(HostPtr));
@@ -265,7 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
-
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
+                                    "NumTeams="+std::to_string(NumTeams)+
+                                    ";NumArgs="+std::to_string(KernelArgs->NumArgs)
+                                    , Loc);
+  
   if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
     printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
                          KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -292,7 +293,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                     RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
+  
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
 
@@ -301,7 +302,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 
   handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
   assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-
+  }
   return OMP_TGT_SUCCESS;
 }
 
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 90b4dc92c299f8..46f8dcb0f087af 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -538,7 +538,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
-
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -821,7 +821,6 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
-  //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
   int Ret = OFFLOAD_SUCCESS;
   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
   // process each input.
@@ -914,8 +913,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-        std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
-        TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);      
+        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1456,8 +1454,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
              "variable (" DPxMOD ")\n",
              DPxPTR(HstPtrVal));
           continue;
-        }
-        TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
+        }     
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret =
@@ -1642,8 +1639,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
-    TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
-
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
+                                    "NumArguments="+std::to_string(KernelArgs.NumArgs)
+                                    +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+                                    +";TripCount="+std::to_string(KernelArgs.Tripcount)
+                                    , Loc);
+    
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
            "Multi dimensional launch not supported yet.");

>From f27edf4922bc3a1cca656cf24655dae15c72b571 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 15:58:50 +0000
Subject: [PATCH 06/14] Some fixes to the profiler

---
 openmp/libomptarget/src/OpenMP/API.cpp | 7 +++++++
 openmp/libomptarget/src/interface.cpp  | 7 ++-----
 openmp/libomptarget/src/omptarget.cpp  | 5 -----
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 1769404faf8884..27802d86aff64e 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,6 +83,7 @@ EXTERN int omp_get_initial_device(void) {
 }
 
 EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
+  TIMESCOPE();
   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -99,6 +100,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
 }
 
 EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
+  TIMESCOPE();
   return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -162,6 +164,11 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                              size_t DstOffset, size_t SrcOffset, int DstDevice,
                              int SrcDevice) {
   TIMESCOPE();
+  /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
+                            "NumArguments="+std::to_string(KernelArgs.NumArgs)
+                            +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+                            +";TripCount="+std::to_string(KernelArgs.Tripcount)
+                            , __FUNCTION__);*/
   DP("Call to omp_target_memcpy, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index f69d9d5330d2df..5eca63fdc39bda 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,6 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -300,8 +299,8 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   if (Rc == OFFLOAD_SUCCESS)
     Rc = AsyncInfo.synchronize();
 
-  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
-  assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+    handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+    assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
   }
   return OMP_TGT_SUCCESS;
 }
@@ -401,7 +400,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
 
 // Get the current number of components for a user-defined mapper.
 EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
-  //TIMESCOPE();
   auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
   int64_t Size = MapperComponentsPtr->Components.size();
   DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -413,7 +411,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
 EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
                                         void *Begin, int64_t Size, int64_t Type,
                                         void *Name) {
-  //TIMESCOPE();
   DP("__tgt_push_mapper_component(Handle=" DPxMOD
      ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
      ", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 46f8dcb0f087af..e0fd34f24e7e75 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) {
 
 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                           const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
 
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                         const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
      DPxPTR(DevicePtr));
 
@@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
 
 void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                          const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
 }
 
 void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
 
   auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,7 +527,6 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
                     bool FromMapper) {
-  //TIMESCOPE_WITH_IDENT(Loc);
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     // Ignore private variables and arrays - there is no mapping for them.

>From 2693c8ece70e6a246be3912d84248271584407e9 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 21:06:06 +0000
Subject: [PATCH 07/14] Adding information to some omp api calls

---
 openmp/libomptarget/src/OpenMP/API.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 27802d86aff64e..9150bcd0789e81 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,7 +83,8 @@ EXTERN int omp_get_initial_device(void) {
 }
 
 EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
+                        +";size="+std::to_string(Size));
   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -163,12 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
 EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                              size_t DstOffset, size_t SrcOffset, int DstDevice,
                              int SrcDevice) {
-  TIMESCOPE();
-  /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
-                            "NumArguments="+std::to_string(KernelArgs.NumArgs)
-                            +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
-                            +";TripCount="+std::to_string(KernelArgs.Tripcount)
-                            , __FUNCTION__);*/
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+                          +";src_dev="+std::to_string(SrcDevice)
+                          +";size="+std::to_string(Length));
   DP("Call to omp_target_memcpy, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
@@ -407,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
                                    size_t DstOffset, size_t SrcOffset,
                                    int DstDevice, int SrcDevice,
                                    int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+                          +";src_dev="+std::to_string(SrcDevice)
+                          +";size="+std::to_string(Length));
   DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
@@ -436,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                        const size_t *DstOffsets, const size_t *SrcOffsets,
                        const size_t *DstDimensions, const size_t *SrcDimensions,
                        int DstDevice, int SrcDevice) {
-  TIMESCOPE();
   DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -495,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
     const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
     const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
     int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+                          +";src_dev="+std::to_string(SrcDevice)
+                          +";size="+std::to_string(ElementSize)
+                          +";num_dims="+std::to_string(NumDims));
   DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "

>From 15b301f3ddb9f38ae8576c682a91c9e1a40a58c9 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 12:26:51 +0000
Subject: [PATCH 08/14] Adding information to the LIBOMPTARGET profiler runtime
 kernel and API calls.

---
 openmp/libomptarget/src/interface.cpp | 14 ++++++++------
 openmp/libomptarget/src/omptarget.cpp | 24 +++++++++++++++---------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 5eca63fdc39bda..19fd6b21f540ec 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,9 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+                                  "NumArgs="+
+                                  std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -262,10 +264,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
-                                    "NumTeams="+std::to_string(NumTeams)+
-                                    ";NumArgs="+std::to_string(KernelArgs->NumArgs)
-                                    , Loc);
-  
+                                   "NumTeams="+std::to_string(NumTeams)+
+                                   ";NumArgs="+
+                                   std::to_string(KernelArgs->NumArgs), Loc);
+
   if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
     printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
                          KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -292,7 +294,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                     RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-  
+
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
 
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e0fd34f24e7e75..890625263d946a 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -533,7 +533,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
-    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
+                                     "Size="+
+                                     std::to_string(ArgSizes[I])+
+                                     "B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -908,7 +911,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
+        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
+                                         std::to_string(DataSize)+"B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1449,7 +1453,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
              "variable (" DPxMOD ")\n",
              DPxPTR(HstPtrVal));
           continue;
-        }     
+        }
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret =
@@ -1558,7 +1562,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
         return Ret;
       });
 
-  
   return OFFLOAD_SUCCESS;
 }
 } // namespace
@@ -1635,11 +1638,14 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
     TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
-                                    "NumArguments="+std::to_string(KernelArgs.NumArgs)
-                                    +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
-                                    +";TripCount="+std::to_string(KernelArgs.Tripcount)
-                                    , Loc);
-    
+                                     "NumArguments="+
+                                     std::to_string(KernelArgs.NumArgs)+
+                                     ";NumTeams="+
+                                     std::to_string(KernelArgs.NumTeams[0])+
+                                     ";TripCount="+
+                                     std::to_string(KernelArgs.Tripcount)
+                                     , Loc);
+
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
            "Multi dimensional launch not supported yet.");

>From c24569ad0e59fc8c90f8ce272b8a43eba1ab0b06 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 19:14:01 +0000
Subject: [PATCH 09/14] Fixing format

---
 llvm/lib/Support/TimeProfiler.cpp      | 10 +++++-----
 openmp/libomptarget/src/OpenMP/API.cpp | 24 ++++++++++++------------
 openmp/libomptarget/src/interface.cpp  | 12 ++++++------
 openmp/libomptarget/src/omptarget.cpp  | 24 ++++++++++--------------
 4 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4446583102a813..330a4d93378aff 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -226,7 +226,7 @@ struct llvm::TimeTraceProfiler {
         J.attribute("tid", int64_t(TotalTid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
-        J.attribute("dur", DurNs / 1000 );
+        J.attribute("dur", DurNs / 1000);
         J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
@@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler {
     // Emit the absolute time when this TimeProfiler started.
     // This can be used to combine the profiling data from
     // multiple processes and preserve actual time intervals.
-    J.attribute("beginningOfTime",
-                time_point_cast<nanoseconds>(BeginningOfTime)
-                    .time_since_epoch()
-                    .count()/1000);
+    J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime)
+                                           .time_since_epoch()
+                                           .count() /
+                                       1000);
 
     J.objectEnd();
   }
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 9150bcd0789e81..a7b6eac8bcd658 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,8 +83,8 @@ EXTERN int omp_get_initial_device(void) {
 }
 
 EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
-  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
-                        +";size="+std::to_string(Size));
+  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
+                         ";size=" + std::to_string(Size));
   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -164,9 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
 EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                              size_t DstOffset, size_t SrcOffset, int DstDevice,
                              int SrcDevice) {
-  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
-                          +";src_dev="+std::to_string(SrcDevice)
-                          +";size="+std::to_string(Length));
+  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+                         ";src_dev=" + std::to_string(SrcDevice) +
+                         ";size=" + std::to_string(Length));
   DP("Call to omp_target_memcpy, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
@@ -405,9 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
                                    size_t DstOffset, size_t SrcOffset,
                                    int DstDevice, int SrcDevice,
                                    int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
-                          +";src_dev="+std::to_string(SrcDevice)
-                          +";size="+std::to_string(Length));
+  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+                         ";src_dev=" + std::to_string(SrcDevice) +
+                         ";size=" + std::to_string(Length));
   DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
@@ -494,10 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
     const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
     const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
     int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
-                          +";src_dev="+std::to_string(SrcDevice)
-                          +";size="+std::to_string(ElementSize)
-                          +";num_dims="+std::to_string(NumDims));
+  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+                         ";src_dev=" + std::to_string(SrcDevice) +
+                         ";size=" + std::to_string(ElementSize) +
+                         ";num_dims=" + std::to_string(NumDims));
   DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 19fd6b21f540ec..88323778fe709e 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -66,8 +66,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
-                                  "NumArgs="+
-                                  std::to_string(ArgNum), Loc);
+                                   "NumArgs=" + std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -263,10 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
-  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
-                                   "NumTeams="+std::to_string(NumTeams)+
-                                   ";NumArgs="+
-                                   std::to_string(KernelArgs->NumArgs), Loc);
+  TIMESCOPE_WITH_DETAILS_AND_IDENT(
+      "Runtime target exe",
+      "NumTeams=" + std::to_string(NumTeams) +
+          ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
+      Loc);
 
   if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
     printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 890625263d946a..a7d55d7ebd5391 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -533,10 +533,8 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
-    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
-                                     "Size="+
-                                     std::to_string(ArgSizes[I])+
-                                     "B", Loc);
+    TIMESCOPE_WITH_DETAILS_AND_IDENT(
+        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -911,8 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
-                                         std::to_string(DataSize)+"B", Loc);
+      TIMESCOPE_WITH_DETAILS_AND_IDENT(
+          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1637,14 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
-    TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
-                                     "NumArguments="+
-                                     std::to_string(KernelArgs.NumArgs)+
-                                     ";NumTeams="+
-                                     std::to_string(KernelArgs.NumTeams[0])+
-                                     ";TripCount="+
-                                     std::to_string(KernelArgs.Tripcount)
-                                     , Loc);
+    TIMESCOPE_WITH_DETAILS_AND_IDENT(
+        "Kernel Target",
+        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
+            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
+            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
+        Loc);
 
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&

>From be468b177b6e18001dfe20bad804953919c10617 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 20:51:40 +0000
Subject: [PATCH 10/14] Change lld/test/MachO/map-file.s: add ':' to the
 test check, given that the profiler now adds a colon to make the total
 clearer in the trace

---
 lld/test/MachO/map-file.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index aa9fff9938eb28..279a15b8e33e60 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -89,7 +89,7 @@
 # CHECK-NEXT: 0x[[#%X,BSS]]            0x00000001  [  2] _number
 # CHECK-EMPTY:
 
-# MAPFILE: "name":"Total Write map file"
+# MAPFILE: "name":"Total: Write map file"
 
 # RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o -force_load \
 # RUN:   %t/libfoo.a %t/c-string-literal.o %t/libbaz.dylib -o %t/stripped

>From f95eb17f60284e94de10b80547e80946d9ef2f81 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Tue, 3 Oct 2023 12:10:13 +0000
Subject: [PATCH 11/14] Removing TimeProfiler microsecond to nanosecond change
 from this PR, it will be a separate PR

---
 llvm/lib/Support/TimeProfiler.cpp | 48 +++++++++++++++----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 330a4d93378aff..4d625b3eb5b170 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
 
 using std::chrono::duration;
 using std::chrono::duration_cast;
-using std::chrono::nanoseconds;
+using std::chrono::microseconds;
 using std::chrono::steady_clock;
 using std::chrono::system_clock;
 using std::chrono::time_point;
@@ -79,15 +79,15 @@ struct TimeTraceProfilerEntry {
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
   // scopes overruning outer scopes.
-  ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
-    return (time_point_cast<nanoseconds>(Start) -
-            time_point_cast<nanoseconds>(StartTime))
+  ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+    return (time_point_cast<microseconds>(Start) -
+            time_point_cast<microseconds>(StartTime))
         .count();
   }
 
-  ClockType::rep getFlameGraphDurNs() const {
-    return (time_point_cast<nanoseconds>(End) -
-            time_point_cast<nanoseconds>(Start))
+  ClockType::rep getFlameGraphDurUs() const {
+    return (time_point_cast<microseconds>(End) -
+            time_point_cast<microseconds>(Start))
         .count();
   }
 };
@@ -114,16 +114,16 @@ struct llvm::TimeTraceProfiler {
 
     // Check that end times monotonically increase.
     assert((Entries.empty() ||
-            (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
-             Entries.back().getFlameGraphStartNs(StartTime) +
-                 Entries.back().getFlameGraphDurNs())) &&
+            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
+             Entries.back().getFlameGraphStartUs(StartTime) +
+                 Entries.back().getFlameGraphDurUs())) &&
            "TimeProfiler scope ended earlier than previous scope");
 
     // Calculate duration at full precision for overall counts.
     DurationType Duration = E.End - E.Start;
 
     // Only include sections longer or equal to TimeTraceGranularity msec.
-    if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
+    if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
       Entries.emplace_back(E);
 
     // Track total time taken by each "name", but only the topmost levels of
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
 
     // Emit all events for the main flame graph.
     auto writeEvent = [&](const auto &E, uint64_t Tid) {
-      auto StartNs = E.getFlameGraphStartNs(StartTime);
-      auto DurNs = E.getFlameGraphDurNs();
+      auto StartUs = E.getFlameGraphStartUs(StartTime);
+      auto DurUs = E.getFlameGraphDurUs();
 
       J.object([&] {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
         J.attribute("ph", "X");
-        J.attribute("ts", StartNs / 1000);
-        J.attribute("dur", DurNs / 1000);
+        J.attribute("ts", StartUs);
+        J.attribute("dur", DurUs);
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
     // Report totals on separate threads of tracing file.
     uint64_t TotalTid = MaxTid + 1;
     for (const NameAndCountAndDurationType &Total : SortedTotals) {
-      auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
+      auto DurUs = duration_cast<microseconds>(Total.second.second).count();
       auto Count = AllCountAndTotalPerName[Total.first].first;
 
       J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
         J.attribute("tid", int64_t(TotalTid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
-        J.attribute("dur", DurNs / 1000);
-        J.attribute("name", "Total: " + Total.first);
+        J.attribute("dur", DurUs);
+        J.attribute("name", "Total " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
-          J.attribute("avg us", int64_t(DurNs / Count / 1000));
+          J.attribute("avg ms", int64_t(DurUs / Count / 1000));
         });
       });
 
@@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler {
     // Emit the absolute time when this TimeProfiler started.
     // This can be used to combine the profiling data from
     // multiple processes and preserve actual time intervals.
-    J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime)
-                                           .time_since_epoch()
-                                           .count() /
-                                       1000);
+    J.attribute("beginningOfTime",
+                time_point_cast<microseconds>(BeginningOfTime)
+                    .time_since_epoch()
+                    .count());
 
     J.objectEnd();
   }
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
   SmallString<0> ThreadName;
   const uint64_t Tid;
 
-  // Minimum time granularity (in nanoseconds)
+  // Minimum time granularity (in microseconds)
   const unsigned TimeTraceGranularity;
 };
 

>From e78708d9428d8cd1481a9105ca565c9838ea02ef Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Tue, 3 Oct 2023 14:09:18 +0000
Subject: [PATCH 12/14] restoring lld/test/MachO/map-file.s test, because the
 change was necessary for the changes to llvm/lib/Support/TimeProfiler.cpp,
 which were removed from this PR

---
 lld/test/MachO/map-file.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index 279a15b8e33e60..aa9fff9938eb28 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -89,7 +89,7 @@
 # CHECK-NEXT: 0x[[#%X,BSS]]            0x00000001  [  2] _number
 # CHECK-EMPTY:
 
-# MAPFILE: "name":"Total: Write map file"
+# MAPFILE: "name":"Total Write map file"
 
 # RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o -force_load \
 # RUN:   %t/libfoo.a %t/c-string-literal.o %t/libbaz.dylib -o %t/stripped

>From bf5d83c3f879a649638bab2a4c75241e23bb6dab Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Fri, 27 Oct 2023 20:58:26 +0000
Subject: [PATCH 13/14] Adding kernel function name to the slice name

---
 openmp/libomptarget/src/interface.cpp | 4 ++--
 openmp/libomptarget/src/omptarget.cpp | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 88323778fe709e..86c95416c41db4 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime Data Copy:",
                                    "NumArgs=" + std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -263,7 +263,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
   TIMESCOPE_WITH_DETAILS_AND_IDENT(
-      "Runtime target exe",
+      "Runtime target exe:",
       "NumTeams=" + std::to_string(NumTeams) +
           ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
       Loc);
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index a7d55d7ebd5391..77a648de567a87 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -534,7 +534,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
     TIMESCOPE_WITH_DETAILS_AND_IDENT(
-        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
+        "HostToDev:", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -910,7 +910,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
       TIMESCOPE_WITH_DETAILS_AND_IDENT(
-          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
+          "DevToHost:", "Size=" + std::to_string(DataSize) + "B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1636,7 +1636,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
     TIMESCOPE_WITH_DETAILS_AND_IDENT(
-        "Kernel Target",
+        "Kernel Target:",
         "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
             ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
             ";TripCount=" + std::to_string(KernelArgs.Tripcount),

>From d56e4b27f606bcb8cdf6b8ec5af20b80b8ad0d5b Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Fri, 22 Dec 2023 19:06:04 +0000
Subject: [PATCH 14/14] Rebase fix features

---
 openmp/libomptarget/include/Shared/Profile.h | 12 ++++++++++++
 openmp/libomptarget/src/interface.cpp        | 11 ++++++-----
 openmp/libomptarget/src/omptarget.cpp        |  6 +++---
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/openmp/libomptarget/include/Shared/Profile.h b/openmp/libomptarget/include/Shared/Profile.h
index 19ca0cf2275186..7e580988a39baf 100644
--- a/openmp/libomptarget/include/Shared/Profile.h
+++ b/openmp/libomptarget/include/Shared/Profile.h
@@ -97,4 +97,16 @@ class Profiler {
   std::string RTM = RegionTypeMsg;                                             \
   llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
 
+/// Time spent in the current scope, assigned to the regionType
+/// with details from runtime
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)        \
+  SourceInfo SI(IDENT);                                                        \
+  std::string ProfileLocation = SI.getProfileLocation();                       \
+  llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
+
+/// Time spent in the current scope, assigned to the function name and source
+/// with details
+#define TIMESCOPE_WITH_DETAILS(Details)                                        \
+  llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
+
 #endif // OMPTARGET_SHARED_PROFILE_H
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 86c95416c41db4..61d9db17f51006 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -65,7 +65,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime Data Copy:",
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
                                    "NumArgs=" + std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -263,7 +263,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
   TIMESCOPE_WITH_DETAILS_AND_IDENT(
-      "Runtime target exe:",
+      "Runtime: target exe",
       "NumTeams=" + std::to_string(NumTeams) +
           ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
       Loc);
@@ -297,9 +297,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
-
-  if (Rc == OFFLOAD_SUCCESS)
-    Rc = AsyncInfo.synchronize();
+  { // required to show synchronization
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
+    if (Rc == OFFLOAD_SUCCESS)
+      Rc = AsyncInfo.synchronize();
 
     handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
     assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 77a648de567a87..a7d55d7ebd5391 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -534,7 +534,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
     TIMESCOPE_WITH_DETAILS_AND_IDENT(
-        "HostToDev:", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
+        "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -910,7 +910,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
       TIMESCOPE_WITH_DETAILS_AND_IDENT(
-          "DevToHost:", "Size=" + std::to_string(DataSize) + "B", Loc);
+          "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1636,7 +1636,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
     TIMESCOPE_WITH_DETAILS_AND_IDENT(
-        "Kernel Target:",
+        "Kernel Target",
         "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
             ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
             ";TripCount=" + std::to_string(KernelArgs.Tripcount),



More information about the Openmp-commits mailing list