[clang-tools-extra] Improve omp offload profiler (PR #68016)

via cfe-commits cfe-commits at lists.llvm.org
Mon Oct 2 11:34:34 PDT 2023


https://github.com/fel-cab created https://github.com/llvm/llvm-project/pull/68016

Summary:
Adding information to the LIBOMPTARGET profiler runtime kernel and API calls.

Key changes:
* Adding information to runtime calls for a better understanding of how the
  application is executing, for example the number of teams requested by the user
  and the size of memory transfers.
* Profile timer was changed from 'us' to 'ns', since 'us' was too coarse-grained
  to register some important details, such as the duration of key kernels.
* Removed profiling of calls that are neither API nor runtime calls, to reduce the
  complexity of the profile for application developers.


>From dd44de067c26ba94b6561c5ed7fa4a5d812a3d1a Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Mon, 18 Sep 2023 12:07:12 +0000
Subject: [PATCH 1/8] testing Profiler features

---
 openmp/libomptarget/src/interface.cpp | 5 ++++-
 openmp/libomptarget/src/private.h     | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 5f21b16b3fbfb1e..f64e1e268a3952e 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -252,7 +252,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_IDENT(Loc);
+  //TIMESCOPE_WITH_IDENT(Loc);
+  TIMESCOPE();
+  //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
+  //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
 
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index cbce15b63a3eba2..dc6cd3944233955 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -433,7 +433,8 @@ class ExponentialBackoff {
   SourceInfo SI(IDENT);                                                        \
   std::string ProfileLocation = SI.getProfileLocation();                       \
   std::string RTM = RegionTypeMsg;                                             \
-  llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+  llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
+  //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
 #else
 #define TIMESCOPE()
 #define TIMESCOPE_WITH_IDENT(IDENT)

>From 92586bca6364100c7511ad38a30f41b0f86dea9c Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 12:02:53 +0000
Subject: [PATCH 2/8] Improve Profiler 1

---
 llvm/lib/Support/TimeProfiler.cpp     |  2 +-
 openmp/libomptarget/src/interface.cpp | 17 +++++++++--------
 openmp/libomptarget/src/omptarget.cpp | 10 +++++-----
 openmp/libomptarget/src/private.h     |  5 +++--
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4d625b3eb5b1709..e1458116f64ab47 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -227,7 +227,7 @@ struct llvm::TimeTraceProfiler {
         J.attribute("ph", "X");
         J.attribute("ts", 0);
         J.attribute("dur", DurUs);
-        J.attribute("name", "Total " + Total.first);
+        J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
           J.attribute("avg ms", int64_t(DurUs / Count / 1000));
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index f64e1e268a3952e..b8892cbe689107f 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,14 @@ using namespace llvm::omp::target::ompt;
 ////////////////////////////////////////////////////////////////////////////////
 /// adds requires flags
 EXTERN void __tgt_register_requires(int64_t Flags) {
-  TIMESCOPE();
+  //TIMESCOPE();
   PM->RTLs.registerRequires(Flags);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 /// adds a target shared library to the target execution image
 EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
-  TIMESCOPE();
+  //TIMESCOPE();
   if (PM->maybeDelayRegisterLib(Desc))
     return;
 
@@ -61,7 +61,7 @@ EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); }
 ////////////////////////////////////////////////////////////////////////////////
 /// unloads a target shared library
 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
-  TIMESCOPE();
+  //TIMESCOPE();
   PM->RTLs.unregisterLib(Desc);
   for (auto &RTL : PM->RTLs.UsedRTLs) {
     if (RTL->unregister_lib) {
@@ -82,7 +82,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+  //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+  TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -253,9 +254,9 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
 
   //TIMESCOPE_WITH_IDENT(Loc);
-  TIMESCOPE();
+  //TIMESCOPE();
   //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
-  //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc);
+  //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
 
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
@@ -411,7 +412,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
 
 // Get the current number of components for a user-defined mapper.
 EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
-  TIMESCOPE();
+  //TIMESCOPE();
   auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
   int64_t Size = MapperComponentsPtr->Components.size();
   DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -423,7 +424,7 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
 EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
                                         void *Begin, int64_t Size, int64_t Type,
                                         void *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("__tgt_push_mapper_component(Handle=" DPxMOD
      ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
      ", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 40419e448942608..3754f63909dac9c 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -398,7 +398,7 @@ static int32_t getParentIndex(int64_t Type) {
 
 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                           const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -427,7 +427,7 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
 
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                         const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
      DPxPTR(DevicePtr));
 
@@ -453,7 +453,7 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
 
 void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                          const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -493,7 +493,7 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
 }
 
 void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
-  TIMESCOPE();
+  //TIMESCOPE();
   DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
 
   DeviceTy *DevicePtr = nullptr;
@@ -572,7 +572,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
                     bool FromMapper) {
-  TIMESCOPE_WITH_IDENT(Loc);
+  //TIMESCOPE_WITH_IDENT(Loc);
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     // Ignore private variables and arrays - there is no mapping for them.
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index dc6cd3944233955..b1ada09d64c7a55 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -433,8 +433,9 @@ class ExponentialBackoff {
   SourceInfo SI(IDENT);                                                        \
   std::string ProfileLocation = SI.getProfileLocation();                       \
   std::string RTM = RegionTypeMsg;                                             \
-  llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
-  //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+  llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+  //llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
+  
 #else
 #define TIMESCOPE()
 #define TIMESCOPE_WITH_IDENT(IDENT)

>From f9167dc8fef277ac1aa53e2e95bade3f0b727df1 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 19 Sep 2023 21:33:24 +0000
Subject: [PATCH 3/8] Changed profiling to work in nanoseconds. Made Profiling
 calls for runtime calls and different ones for kernel launches and memory
 transfers.

---
 llvm/lib/Support/TimeProfiler.cpp     | 28 +++++++++++++--------------
 openmp/libomptarget/src/interface.cpp |  7 ++-----
 openmp/libomptarget/src/omptarget.cpp | 11 +++++++----
 openmp/libomptarget/src/private.h     |  6 +++---
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index e1458116f64ab47..64b3ef35be27c42 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -30,7 +30,7 @@ namespace {
 
 using std::chrono::duration;
 using std::chrono::duration_cast;
-using std::chrono::microseconds;
+using std::chrono::nanoseconds;
 using std::chrono::steady_clock;
 using std::chrono::system_clock;
 using std::chrono::time_point;
@@ -80,14 +80,14 @@ struct TimeTraceProfilerEntry {
   // rather than casting duration. This avoids truncation issues causing inner
   // scopes overruning outer scopes.
   ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
-    return (time_point_cast<microseconds>(Start) -
-            time_point_cast<microseconds>(StartTime))
+    return (time_point_cast<nanoseconds>(Start) -
+            time_point_cast<nanoseconds>(StartTime))
         .count();
   }
 
   ClockType::rep getFlameGraphDurUs() const {
-    return (time_point_cast<microseconds>(End) -
-            time_point_cast<microseconds>(Start))
+    return (time_point_cast<nanoseconds>(End) -
+            time_point_cast<nanoseconds>(Start))
         .count();
   }
 };
@@ -123,7 +123,7 @@ struct llvm::TimeTraceProfiler {
     DurationType Duration = E.End - E.Start;
 
     // Only include sections longer or equal to TimeTraceGranularity msec.
-    if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity)
+    if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity)
       Entries.emplace_back(E);
 
     // Track total time taken by each "name", but only the topmost levels of
@@ -169,8 +169,8 @@ struct llvm::TimeTraceProfiler {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
         J.attribute("ph", "X");
-        J.attribute("ts", StartUs);
-        J.attribute("dur", DurUs);
+        J.attribute("ts", StartUs / 1000);
+        J.attribute("dur", DurUs / 1000);
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
     // Report totals on separate threads of tracing file.
     uint64_t TotalTid = MaxTid + 1;
     for (const NameAndCountAndDurationType &Total : SortedTotals) {
-      auto DurUs = duration_cast<microseconds>(Total.second.second).count();
+      auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
       auto Count = AllCountAndTotalPerName[Total.first].first;
 
       J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
         J.attribute("tid", int64_t(TotalTid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
-        J.attribute("dur", DurUs);
+        J.attribute("dur", DurUs / 1000);
         J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
-          J.attribute("avg ms", int64_t(DurUs / Count / 1000));
+          J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
         });
       });
 
@@ -262,9 +262,9 @@ struct llvm::TimeTraceProfiler {
     // This can be used to combine the profiling data from
     // multiple processes and preserve actual time intervals.
     J.attribute("beginningOfTime",
-                time_point_cast<microseconds>(BeginningOfTime)
+                time_point_cast<nanoseconds>(BeginningOfTime)
                     .time_since_epoch()
-                    .count());
+                    .count()/1000);
 
     J.objectEnd();
   }
@@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler {
   SmallString<0> ThreadName;
   const uint64_t Tid;
 
-  // Minimum time granularity (in microseconds)
+  // Minimum time granularity (in nanoseconds)
   const unsigned TimeTraceGranularity;
 };
 
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index b8892cbe689107f..d4ee246f84449f1 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -83,7 +83,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
-  TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc);
+  TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -253,10 +253,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  //TIMESCOPE_WITH_IDENT(Loc);
-  //TIMESCOPE();
-  //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc);
-  //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
+  TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
 
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 3754f63909dac9c..ad966e7e1c47544 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -863,6 +863,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
+  //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
   int Ret = OFFLOAD_SUCCESS;
   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
   // process each input.
@@ -955,7 +956,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-
+      TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1445,7 +1446,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                              SmallVector<ptrdiff_t> &TgtOffsets,
                              PrivateArgumentManagerTy &PrivateArgumentManager,
                              AsyncInfoTy &AsyncInfo) {
-  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
+  //TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
   DeviceTy &Device = *PM->Devices[DeviceId];
   int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
@@ -1493,6 +1494,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
              DPxPTR(HstPtrVal));
           continue;
         }
+        TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
@@ -1572,7 +1574,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
-  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
+  //TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
   DeviceTy &Device = *PM->Devices[DeviceId];
 
   // Move data from device.
@@ -1597,6 +1599,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
         return Ret;
       });
 
+  
   return OFFLOAD_SUCCESS;
 }
 } // namespace
@@ -1672,7 +1675,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
-    TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
+    TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
 
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index b1ada09d64c7a55..f0591cd17b0fd15 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -432,10 +432,10 @@ class ExponentialBackoff {
 #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)                     \
   SourceInfo SI(IDENT);                                                        \
   std::string ProfileLocation = SI.getProfileLocation();                       \
+  std::string ProfileName = SI.getName();                       \
   std::string RTM = RegionTypeMsg;                                             \
-  llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
-  //llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM)
-  
+  llvm::TimeTraceScope TimeScope(ProfileName, ProfileLocation + RTM)
+  //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
 #else
 #define TIMESCOPE()
 #define TIMESCOPE_WITH_IDENT(IDENT)

>From c82ce52f244d218752fea2dcc1f347fc589cd016 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Thu, 21 Sep 2023 14:22:28 +0000
Subject: [PATCH 4/8] test with DevToHost

---
 openmp/libomptarget/src/omptarget.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index ad966e7e1c47544..e113942375ef9c6 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -956,7 +956,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-      TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc);
+        std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
+        TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);      
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {

>From 448f0e77b6c824de73cbd9ae34d4c59b02e7e441 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Fri, 22 Sep 2023 21:48:57 +0000
Subject: [PATCH 5/8] Fixing nanoseconds in totals, adding synchronize timings,
 and adding extra info in kernels and device

---
 llvm/lib/Support/TimeProfiler.cpp     | 24 ++++++++++++------------
 openmp/libomptarget/src/interface.cpp | 18 ++++++++++--------
 openmp/libomptarget/src/omptarget.cpp | 19 +++++++++----------
 openmp/libomptarget/src/private.h     | 10 +++++++---
 4 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 64b3ef35be27c42..4446583102a8133 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -79,13 +79,13 @@ struct TimeTraceProfilerEntry {
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
   // scopes overruning outer scopes.
-  ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const {
+  ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const {
     return (time_point_cast<nanoseconds>(Start) -
             time_point_cast<nanoseconds>(StartTime))
         .count();
   }
 
-  ClockType::rep getFlameGraphDurUs() const {
+  ClockType::rep getFlameGraphDurNs() const {
     return (time_point_cast<nanoseconds>(End) -
             time_point_cast<nanoseconds>(Start))
         .count();
@@ -114,9 +114,9 @@ struct llvm::TimeTraceProfiler {
 
     // Check that end times monotonically increase.
     assert((Entries.empty() ||
-            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
-             Entries.back().getFlameGraphStartUs(StartTime) +
-                 Entries.back().getFlameGraphDurUs())) &&
+            (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >=
+             Entries.back().getFlameGraphStartNs(StartTime) +
+                 Entries.back().getFlameGraphDurNs())) &&
            "TimeProfiler scope ended earlier than previous scope");
 
     // Calculate duration at full precision for overall counts.
@@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler {
 
     // Emit all events for the main flame graph.
     auto writeEvent = [&](const auto &E, uint64_t Tid) {
-      auto StartUs = E.getFlameGraphStartUs(StartTime);
-      auto DurUs = E.getFlameGraphDurUs();
+      auto StartNs = E.getFlameGraphStartNs(StartTime);
+      auto DurNs = E.getFlameGraphDurNs();
 
       J.object([&] {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
         J.attribute("ph", "X");
-        J.attribute("ts", StartUs / 1000);
-        J.attribute("dur", DurUs / 1000);
+        J.attribute("ts", StartNs / 1000);
+        J.attribute("dur", DurNs / 1000);
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
@@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler {
     // Report totals on separate threads of tracing file.
     uint64_t TotalTid = MaxTid + 1;
     for (const NameAndCountAndDurationType &Total : SortedTotals) {
-      auto DurUs = duration_cast<nanoseconds>(Total.second.second).count();
+      auto DurNs = duration_cast<nanoseconds>(Total.second.second).count();
       auto Count = AllCountAndTotalPerName[Total.first].first;
 
       J.object([&] {
@@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler {
         J.attribute("tid", int64_t(TotalTid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
-        J.attribute("dur", DurUs / 1000);
+        J.attribute("dur", DurNs / 1000 );
         J.attribute("name", "Total: " + Total.first);
         J.attributeObject("args", [&] {
           J.attribute("count", int64_t(Count));
-          J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000));
+          J.attribute("avg us", int64_t(DurNs / Count / 1000));
         });
       });
 
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d4ee246f84449f1..bed9b1e40db455b 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -83,7 +83,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
-  TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc);
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -252,9 +252,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                KernelArgsTy *KernelArgs) {
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-
-  TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc);
-
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",
      DeviceId, DPxPTR(HostPtr));
@@ -279,7 +276,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
-
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
+                                    "NumTeams="+std::to_string(NumTeams)+
+                                    ";NumArgs="+std::to_string(KernelArgs->NumArgs)
+                                    , Loc);
+  
   if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
     printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
                          KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -303,16 +304,17 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                     RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-
+  
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
-
+  {
+    TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc);
   if (Rc == OFFLOAD_SUCCESS)
     Rc = AsyncInfo.synchronize();
 
   handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
   assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-
+  }
   return OMP_TGT_SUCCESS;
 }
 
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e113942375ef9c6..5f6168b0bd2fca0 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -579,7 +579,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
-
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -863,7 +863,6 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                   void **ArgBases, void **Args, int64_t *ArgSizes,
                   int64_t *ArgTypes, map_var_info_t *ArgNames,
                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
-  //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc);
   int Ret = OFFLOAD_SUCCESS;
   auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
   // process each input.
@@ -956,8 +955,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-        std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B";
-        TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc);      
+        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1447,7 +1445,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                              SmallVector<ptrdiff_t> &TgtOffsets,
                              PrivateArgumentManagerTy &PrivateArgumentManager,
                              AsyncInfoTy &AsyncInfo) {
-  //TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
   DeviceTy &Device = *PM->Devices[DeviceId];
   int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
@@ -1494,8 +1491,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
              "variable (" DPxMOD ")\n",
              DPxPTR(HstPtrVal));
           continue;
-        }
-        TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc);
+        }     
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
@@ -1575,7 +1571,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
-  //TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
   DeviceTy &Device = *PM->Devices[DeviceId];
 
   // Move data from device.
@@ -1676,8 +1671,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
-    TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc);
-
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
+                                    "NumArguments="+std::to_string(KernelArgs.NumArgs)
+                                    +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+                                    +";TripCount="+std::to_string(KernelArgs.Tripcount)
+                                    , Loc);
+    
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
            "Multi dimensional launch not supported yet.");
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index f0591cd17b0fd15..4bc1db79de3f2b7 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -432,14 +432,18 @@ class ExponentialBackoff {
 #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)                     \
   SourceInfo SI(IDENT);                                                        \
   std::string ProfileLocation = SI.getProfileLocation();                       \
-  std::string ProfileName = SI.getName();                       \
+  std::string ProfileName = SI.getName();                                      \
   std::string RTM = RegionTypeMsg;                                             \
-  llvm::TimeTraceScope TimeScope(ProfileName, ProfileLocation + RTM)
+  llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
   //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)        \
+  SourceInfo SI(IDENT);                                                        \
+  std::string ProfileLocation = SI.getProfileLocation();                       \
+  llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
 #else
 #define TIMESCOPE()
 #define TIMESCOPE_WITH_IDENT(IDENT)
 #define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)
 #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)
-
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)
 #endif

>From c8bb24e807324a6a42b50076e5a3d2159f1d6d74 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 15:58:50 +0000
Subject: [PATCH 6/8] Some fixes to the profiler

---
 openmp/libomptarget/src/api.cpp       |  7 +++++++
 openmp/libomptarget/src/interface.cpp | 16 +++++-----------
 openmp/libomptarget/src/omptarget.cpp |  5 -----
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 942df8fdb94d660..f628a64c5b69fa4 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -50,6 +50,7 @@ EXTERN int omp_get_initial_device(void) {
 }
 
 EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
+  TIMESCOPE();
   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -66,6 +67,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
 }
 
 EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
+  TIMESCOPE();
   return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -134,6 +136,11 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                              size_t DstOffset, size_t SrcOffset, int DstDevice,
                              int SrcDevice) {
   TIMESCOPE();
+  /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
+                            "NumArguments="+std::to_string(KernelArgs.NumArgs)
+                            +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
+                            +";TripCount="+std::to_string(KernelArgs.Tripcount)
+                            , __FUNCTION__);*/
   DP("Call to omp_target_memcpy, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index bed9b1e40db455b..61a340ccf8d1b10 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
 ////////////////////////////////////////////////////////////////////////////////
 /// adds requires flags
 EXTERN void __tgt_register_requires(int64_t Flags) {
-  //TIMESCOPE();
   PM->RTLs.registerRequires(Flags);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 /// adds a target shared library to the target execution image
 EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
-  //TIMESCOPE();
   if (PM->maybeDelayRegisterLib(Desc))
     return;
 
@@ -61,7 +59,6 @@ EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); }
 ////////////////////////////////////////////////////////////////////////////////
 /// unloads a target shared library
 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
-  //TIMESCOPE();
   PM->RTLs.unregisterLib(Desc);
   for (auto &RTL : PM->RTLs.UsedRTLs) {
     if (RTL->unregister_lib) {
@@ -82,7 +79,6 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
@@ -307,13 +303,13 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
-  {
+  { //required to show syncronization
     TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc);
-  if (Rc == OFFLOAD_SUCCESS)
-    Rc = AsyncInfo.synchronize();
+    if (Rc == OFFLOAD_SUCCESS)
+      Rc = AsyncInfo.synchronize();
 
-  handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
-  assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+    handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+    assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
   }
   return OMP_TGT_SUCCESS;
 }
@@ -411,7 +407,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
 
 // Get the current number of components for a user-defined mapper.
 EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
-  //TIMESCOPE();
   auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
   int64_t Size = MapperComponentsPtr->Components.size();
   DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -423,7 +418,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
 EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
                                         void *Begin, int64_t Size, int64_t Type,
                                         void *Name) {
-  //TIMESCOPE();
   DP("__tgt_push_mapper_component(Handle=" DPxMOD
      ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
      ", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 5f6168b0bd2fca0..450f34894fb56b4 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -398,7 +398,6 @@ static int32_t getParentIndex(int64_t Type) {
 
 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                           const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -427,7 +426,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
 
 void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                         const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
      DPxPTR(DevicePtr));
 
@@ -453,7 +451,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
 
 void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                          const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
 
   if (Size <= 0) {
@@ -493,7 +490,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
 }
 
 void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
-  //TIMESCOPE();
   DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
 
   DeviceTy *DevicePtr = nullptr;
@@ -572,7 +568,6 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo,
                     bool FromMapper) {
-  //TIMESCOPE_WITH_IDENT(Loc);
   // process each input.
   for (int32_t I = 0; I < ArgNum; ++I) {
     // Ignore private variables and arrays - there is no mapping for them.

>From da71cf17918c56e6a64c1e966dbb5d0dd79d0ed9 Mon Sep 17 00:00:00 2001
From: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Date: Tue, 26 Sep 2023 21:06:06 +0000
Subject: [PATCH 7/8] Adding information to some omp api calls

---
 openmp/libomptarget/src/api.cpp   | 22 ++++++++++++----------
 openmp/libomptarget/src/private.h |  7 +++++--
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index f628a64c5b69fa4..5dd918808492997 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -50,7 +50,8 @@ EXTERN int omp_get_initial_device(void) {
 }
 
 EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum)
+                        +";size="+std::to_string(Size));
   return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
 }
 
@@ -135,12 +136,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
 EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                              size_t DstOffset, size_t SrcOffset, int DstDevice,
                              int SrcDevice) {
-  TIMESCOPE();
-  /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy",
-                            "NumArguments="+std::to_string(KernelArgs.NumArgs)
-                            +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
-                            +";TripCount="+std::to_string(KernelArgs.Tripcount)
-                            , __FUNCTION__);*/
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+                          +";src_dev="+std::to_string(SrcDevice)
+                          +";size="+std::to_string(Length));
   DP("Call to omp_target_memcpy, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
@@ -293,7 +291,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
                                    size_t DstOffset, size_t SrcOffset,
                                    int DstDevice, int SrcDevice,
                                    int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+                          +";src_dev="+std::to_string(SrcDevice)
+                          +";size="+std::to_string(Length));
   DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
      "src offset %zu, length %zu\n",
@@ -321,7 +321,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                        const size_t *DstOffsets, const size_t *SrcOffsets,
                        const size_t *DstDimensions, const size_t *SrcDimensions,
                        int DstDevice, int SrcDevice) {
-  TIMESCOPE();
   DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -380,7 +379,10 @@ EXTERN int omp_target_memcpy_rect_async(
     const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
     const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
     int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
-  TIMESCOPE();
+  TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice)
+                          +";src_dev="+std::to_string(SrcDevice)
+                          +";size="+std::to_string(ElementSize)
+                          +";num_dims="+std::to_string(NumDims));
   DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index 4bc1db79de3f2b7..c8d07138b180d17 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -434,16 +434,19 @@ class ExponentialBackoff {
   std::string ProfileLocation = SI.getProfileLocation();                       \
   std::string ProfileName = SI.getName();                                      \
   std::string RTM = RegionTypeMsg;                                             \
-  llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
-  //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+  llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+  //llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM)
 #define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)        \
   SourceInfo SI(IDENT);                                                        \
   std::string ProfileLocation = SI.getProfileLocation();                       \
   llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
+#define TIMESCOPE_WITH_DETAILS(Details)                                        \
+  llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
 #else
 #define TIMESCOPE()
 #define TIMESCOPE_WITH_IDENT(IDENT)
 #define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT)
 #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT)
 #define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT)
+#define TIMESCOPE_WITH_DETAILS(Details)
 #endif

>From f273bbcc66f361fe9cc03d8597ee886122b5e235 Mon Sep 17 00:00:00 2001
From: fel-cab <fel-cab at github.com>
Date: Mon, 2 Oct 2023 12:26:51 +0000
Subject: [PATCH 8/8] Adding information to the LIBOMPTARGET profiler runtime
 kernel and API calls.

---
 openmp/libomptarget/src/interface.cpp | 14 ++++++++------
 openmp/libomptarget/src/omptarget.cpp | 24 +++++++++++++++---------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 61a340ccf8d1b10..99a7abc7e0bcee9 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -79,7 +79,9 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
   static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
-  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc);
+  TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+                                  "NumArgs="+
+                                  std::to_string(ArgNum), Loc);
 
   DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
      RegionName, DeviceId, ArgNum);
@@ -273,10 +275,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
          !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
          "OpenMP interface should not use multiple dimensions");
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe",
-                                    "NumTeams="+std::to_string(NumTeams)+
-                                    ";NumArgs="+std::to_string(KernelArgs->NumArgs)
-                                    , Loc);
-  
+                                   "NumTeams="+std::to_string(NumTeams)+
+                                   ";NumArgs="+
+                                   std::to_string(KernelArgs->NumArgs), Loc);
+
   if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
     printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
                          KernelArgs->ArgSizes, KernelArgs->ArgTypes,
@@ -300,7 +302,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
                     RegionInterface.getCallbacks<ompt_target>(), DeviceId,
                     /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
-  
+
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
   { //required to show syncronization
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 450f34894fb56b4..b5a2dfc68569081 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -574,7 +574,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
       continue;
-    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc);
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev",
+                                     "Size="+
+                                     std::to_string(ArgSizes[I])+
+                                     "B", Loc);
     if (ArgMappers && ArgMappers[I]) {
       // Instead of executing the regular path of targetDataBegin, call the
       // targetDataMapper variant which will call targetDataBegin again
@@ -950,7 +953,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
         !TPR.Flags.IsHostPointer && DataSize != 0) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc);
+        TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+
+                                         std::to_string(DataSize)+"B", Loc);
       // Wait for any previous transfer if an event is present.
       if (void *Event = TPR.getEntry()->getEvent()) {
         if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1486,7 +1490,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
              "variable (" DPxMOD ")\n",
              DPxPTR(HstPtrVal));
           continue;
-        }     
+        }
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
@@ -1590,7 +1594,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
         return Ret;
       });
 
-  
   return OFFLOAD_SUCCESS;
 }
 } // namespace
@@ -1667,11 +1670,14 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   {
     assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
     TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target",
-                                    "NumArguments="+std::to_string(KernelArgs.NumArgs)
-                                    +";NumTeams="+std::to_string(KernelArgs.NumTeams[0])
-                                    +";TripCount="+std::to_string(KernelArgs.Tripcount)
-                                    , Loc);
-    
+                                     "NumArguments="+
+                                     std::to_string(KernelArgs.NumArgs)+
+                                     ";NumTeams="+
+                                     std::to_string(KernelArgs.NumTeams[0])+
+                                     ";TripCount="+
+                                     std::to_string(KernelArgs.Tripcount)
+                                     , Loc);
+
 #ifdef OMPT_SUPPORT
     assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
            "Multi dimensional launch not supported yet.");



More information about the cfe-commits mailing list