[Openmp-commits] [openmp] 74f9174 - [OpenMP] Use function tracing RAII for runtime functions.

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Fri Oct 29 11:57:22 PDT 2021


Author: Joseph Huber
Date: 2021-10-29T14:57:11-04:00
New Revision: 74f91741b66b9327fdbae6411286672ec088c3a3

URL: https://github.com/llvm/llvm-project/commit/74f91741b66b9327fdbae6411286672ec088c3a3
DIFF: https://github.com/llvm/llvm-project/commit/74f91741b66b9327fdbae6411286672ec088c3a3.diff

LOG: [OpenMP] Use function tracing RAII for runtime functions.

This patch adds support for using function tracing features to track the
execution of runtime functions in the device runtime library. This is
enabled by first compiling the new runtime with
`-fopenmp-target-debug=3` and running with
`LIBOMPTARGET_DEVICE_RTL_DEBUG=3`. The output only tracks team 0 and
thread 0 so there isn't much output when using a generic region.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D112002

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/Debug.h
    openmp/libomptarget/DeviceRTL/src/Debug.cpp
    openmp/libomptarget/DeviceRTL/src/Kernel.cpp
    openmp/libomptarget/DeviceRTL/src/Mapping.cpp
    openmp/libomptarget/DeviceRTL/src/Misc.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/Reduction.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
    openmp/libomptarget/DeviceRTL/src/Tasking.cpp
    openmp/libomptarget/DeviceRTL/src/Utils.cpp
    openmp/libomptarget/DeviceRTL/src/Workshare.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h
index e8e9078ee495c..6aa801d80bd7e 100644
--- a/openmp/libomptarget/DeviceRTL/include/Debug.h
+++ b/openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -49,13 +49,13 @@ int printf(const char *format, ...);
 /// Enter a debugging scope for performing function traces. Enabled with
 /// FunctionTracting set in the debug kind.
 #define FunctionTracingRAII()                                                  \
-  DebugEntryRAII Entry(__LINE__, __PRETTY_FUNCTION__);
+  DebugEntryRAII Entry(__FILE__, __LINE__, __PRETTY_FUNCTION__);
 
 /// An RAII class for handling entries to debug locations. The current location
 /// and function will be printed on entry. Nested levels increase the
 /// indentation shown in the debugging output.
 struct DebugEntryRAII {
-  DebugEntryRAII(const unsigned Line, const char *Function);
+  DebugEntryRAII(const char *File, const unsigned Line, const char *Function);
   ~DebugEntryRAII();
 };
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
index 6fa684339c7ab..2f0e608832a71 100644
--- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -12,6 +12,7 @@
 
 #include "Debug.h"
 #include "Configuration.h"
+#include "Interface.h"
 #include "Mapping.h"
 #include "Types.h"
 
@@ -41,14 +42,15 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
 static uint32_t Level = 0;
 #pragma omp allocate(Level) allocator(omp_pteam_mem_alloc)
 
-DebugEntryRAII::DebugEntryRAII(const unsigned Line, const char *Function) {
+DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
+                               const char *Function) {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0) {
+      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
 
     for (int I = 0; I < Level; ++I)
       PRINTF("%s", "  ");
 
-    PRINTF("Line %u: Thread %u Entering %s:%u\n", Line,
+    PRINTF("%s:%u: Thread %u Entering %s\n", File, Line,
            mapping::getThreadIdInBlock(), Function);
     Level++;
   }
@@ -56,7 +58,7 @@ DebugEntryRAII::DebugEntryRAII(const unsigned Line, const char *Function) {
 
 DebugEntryRAII::~DebugEntryRAII() {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0)
+      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0)
     Level--;
 }
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index d47fa03c367ec..94bf432acb9a9 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -30,6 +30,7 @@ static void inititializeRuntime(bool IsSPMD) {
 
 /// Simple generic state machine for worker threads.
 static void genericStateMachine(IdentTy *Ident) {
+  FunctionTracingRAII();
 
   uint32_t TId = mapping::getThreadIdInBlock();
 
@@ -66,6 +67,7 @@ extern "C" {
 ///
 int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
                            bool UseGenericStateMachine, bool) {
+  FunctionTracingRAII();
   const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
@@ -98,6 +100,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
 /// \param Ident Source location identification, can be NULL.
 ///
 void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
+  FunctionTracingRAII();
   const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
@@ -107,7 +110,10 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) {
   state::ParallelRegionFn = nullptr;
 }
 
-int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
+int8_t __kmpc_is_spmd_exec_mode() {
+  FunctionTracingRAII();
+  return mapping::isSPMDMode();
+}
 }
 
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 740cc7be899f0..9bd26c80636ef 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -231,10 +231,12 @@ bool mapping::isGenericMode() { return !isSPMDMode(); }
 
 extern "C" {
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() {
+  FunctionTracingRAII();
   return mapping::getThreadIdInBlock();
 }
 
 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
+  FunctionTracingRAII();
   return mapping::getNumberOfProcessorElements();
 }
 }

diff  --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
index 44fb85b552af9..7284be87896f7 100644
--- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -11,6 +11,8 @@
 
 #include "Types.h"
 
+#include "Debug.h"
+
 #pragma omp declare target
 
 namespace _OMP {
@@ -60,9 +62,15 @@ double getWTime() {
 ///{
 
 extern "C" {
-int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
+int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) {
+  FunctionTracingRAII();
+  return 0;
+}
 
-int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
+int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) {
+  FunctionTracingRAII();
+  return 0;
+}
 
 double omp_get_wtick(void) { return _OMP::impl::getWTick(); }
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index e80f046aa49b6..8dcda213b5fba 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -66,6 +66,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                      void **args, int64_t nargs) {
+  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
   switch (nargs) {
 #include "generated_microtask_cases.gen"
   default:
@@ -81,6 +82,7 @@ extern "C" {
 void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                         int32_t num_threads, int proc_bind, void *fn,
                         void *wrapper_fn, void **args, int64_t nargs) {
+  FunctionTracingRAII();
 
   uint32_t TId = mapping::getThreadIdInBlock();
   // Handle the serialized case first, same for SPMD/non-SPMD.
@@ -171,6 +173,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
 __attribute__((noinline)) bool
 __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+  FunctionTracingRAII();
   // Work function and arguments for L1 parallel region.
   *WorkFn = state::ParallelRegionFn;
 
@@ -185,6 +188,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
 }
 
 __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
+  FunctionTracingRAII();
   // In case we have modified an ICV for this thread before a ThreadState was
   // created. We drop it now to not contaminate the next parallel region.
   ASSERT(!mapping::isSPMDMode());
@@ -193,18 +197,29 @@ __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
   ASSERT(!mapping::isSPMDMode());
 }
 
-uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
+uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
+  FunctionTracingRAII();
+  return omp_get_level();
+}
 
-int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
+int32_t __kmpc_global_thread_num(IdentTy *) {
+  FunctionTracingRAII();
+  return omp_get_thread_num();
+}
 
 void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) {
+  FunctionTracingRAII();
   icv::NThreads = NumThreads;
 }
 
 void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
-                           int32_t thread_limit) {}
+                           int32_t thread_limit) {
+  FunctionTracingRAII();
+}
 
-void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
+void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
+  FunctionTracingRAII();
+}
 }
 
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index 05efe956b38bf..dd1d30dd4cbfd 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -176,6 +176,7 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
+  FunctionTracingRAII();
   return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                       shflFct, cpyFct, mapping::isSPMDMode(),
                                       false);
@@ -186,6 +187,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
     ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
     ListGlobalFnTy glredFct) {
+  FunctionTracingRAII();
 
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
@@ -310,9 +312,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   return 0;
 }
 
-void __kmpc_nvptx_end_reduce(int32_t TId) {}
+void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }
 
-void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
+void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
 }
 
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index f39b61c669308..54a191ce01fa6 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -498,10 +498,12 @@ int omp_get_initial_device(void) { return -1; }
 
 extern "C" {
 __attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
+  FunctionTracingRAII();
   return memory::allocShared(Bytes, "Frontend alloc shared");
 }
 
 __attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
+  FunctionTracingRAII();
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
 
@@ -523,6 +525,7 @@ constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
     allocator(omp_pteam_mem_alloc)
 
 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
+  FunctionTracingRAII();
   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
     SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
   } else {
@@ -533,11 +536,13 @@ void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
 }
 
 void __kmpc_end_sharing_variables() {
+  FunctionTracingRAII();
   if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
     memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
 }
 
 void __kmpc_get_shared_variables(void ***GlobalArgs) {
+  FunctionTracingRAII();
   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }
 }

diff  --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 931dffcaa131e..e219c75d04ea6 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -321,16 +321,18 @@ uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
 }
 
 extern "C" {
-void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
+void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }
 
-void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
+void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }
 
 int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   __kmpc_barrier(Loc, TId);
   return 0;
 }
 
 void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   if (mapping::isMainThreadInGenericMode())
     return __kmpc_flush(Loc);
 
@@ -342,34 +344,49 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
 
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
+  FunctionTracingRAII();
   synchronize::threadsAligned();
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   return omp_get_team_num() == 0;
 }
 
-void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
+void __kmpc_end_master(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }
 
 int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   return __kmpc_master(Loc, TId);
 }
 
 void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
+  FunctionTracingRAII();
   // The barrier is explicitly called.
 }
 
-void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
+void __kmpc_flush(IdentTy *Loc) {
+  FunctionTracingRAII();
+  fence::kernel(__ATOMIC_SEQ_CST);
+}
 
-uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
+uint64_t __kmpc_warp_active_thread_mask(void) {
+  FunctionTracingRAII();
+  return mapping::activemask();
+}
 
-void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
+void __kmpc_syncwarp(uint64_t Mask) {
+  FunctionTracingRAII();
+  synchronize::warp(Mask);
+}
 
 void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  FunctionTracingRAII();
   omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));
 }
 
 void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  FunctionTracingRAII();
   omp_unset_lock(reinterpret_cast<omp_lock_t *>(Name));
 }
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
index 6b6991e772f2a..0416395b10545 100644
--- a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
@@ -26,6 +26,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t,
                                         uint64_t TaskSizeInclPrivateValues,
                                         uint64_t SharedValuesSize,
                                         TaskFnTy TaskFn) {
+  FunctionTracingRAII();
   auto TaskSizeInclPrivateValuesPadded =
       utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
   auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
@@ -40,12 +41,14 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t,
 
 int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
                         TaskDescriptorTy *TaskDescriptor) {
+  FunctionTracingRAII();
   return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
 }
 
 int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
                                   TaskDescriptorTy *TaskDescriptor, int32_t,
                                   void *, int32_t, void *) {
+  FunctionTracingRAII();
   state::DateEnvironmentRAII DERAII;
 
   TaskDescriptor->TaskFn(0, TaskDescriptor);
@@ -56,31 +59,42 @@ int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
 
 void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
                                TaskDescriptorTy *TaskDescriptor) {
+  FunctionTracingRAII();
   state::enterDataEnvironment();
 }
 
 void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
                                   TaskDescriptorTy *TaskDescriptor) {
+  FunctionTracingRAII();
   state::exitDataEnvironment();
 
   memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
 }
 
 void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
-                          void *) {}
+                          void *) {
+  FunctionTracingRAII();
+}
 
-void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
+void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) { FunctionTracingRAII(); }
 
-void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
+void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) { FunctionTracingRAII(); }
 
-int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
+int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) {
+  FunctionTracingRAII();
+  return 0;
+}
 
-int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
+int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) {
+  FunctionTracingRAII();
+  return 0;
+}
 
 void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
                      TaskDescriptorTy *TaskDescriptor, int,
                      uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
                      int32_t, uint64_t, void *) {
+  FunctionTracingRAII();
   // Skip task entirely if empty iteration space.
   if (*LowerBound > *UpperBound)
     return;

diff  --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
index 3f65f2166481a..8fcb96b158cf0 100644
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -11,6 +11,7 @@
 
 #include "Utils.h"
 
+#include "Debug.h"
 #include "Interface.h"
 #include "Mapping.h"
 
@@ -129,10 +130,12 @@ int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
 
 extern "C" {
 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
+  FunctionTracingRAII();
   return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
 }
 
 int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
+  FunctionTracingRAII();
   uint32_t lo, hi;
   utils::unpack(Val, lo, hi);
   hi = impl::shuffleDown(lanes::All, hi, Delta, Width);

diff  --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
index 89c10b1fa11f7..24f3fee2aa5b4 100644
--- a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -470,6 +470,7 @@ extern "C" {
 // init
 void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                             int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -478,6 +479,7 @@ void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
 void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                              uint32_t lb, uint32_t ub, int32_t st,
                              int32_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -485,6 +487,7 @@ void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
 
 void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                             int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -493,6 +496,7 @@ void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
 void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                              uint64_t lb, uint64_t ub, int64_t st,
                              int64_t chunk) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
       loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
@@ -501,6 +505,7 @@ void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
 // next
 int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                            int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -508,6 +513,7 @@ int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
 
 int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                             uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -515,6 +521,7 @@ int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
 
 int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                            int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -522,6 +529,7 @@ int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
 
 int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                             uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  FunctionTracingRAII();
   DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
       loc, tid, p_last, p_lb, p_ub, p_st, DST);
@@ -529,21 +537,25 @@ int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
 
 // fini
 void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
   popDST();
 }
 
 void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
   popDST();
 }
 
 void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
   popDST();
 }
 
 void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
   popDST();
 }
@@ -556,6 +568,7 @@ void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               int32_t *plower, int32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -565,6 +578,7 @@ void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                                int32_t schedtype, int32_t *plastiter,
                                uint32_t *plower, uint32_t *pupper,
                                int32_t *pstride, int32_t incr, int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -574,6 +588,7 @@ void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               int64_t *plower, int64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -583,6 +598,7 @@ void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                                int32_t schedtype, int32_t *plastiter,
                                uint64_t *plower, uint64_t *pupper,
                                int64_t *pstride, int64_t incr, int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -593,6 +609,7 @@ void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                      int32_t *plower, int32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -603,6 +620,7 @@ void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                       uint32_t *plower, uint32_t *pupper,
                                       int32_t *pstride, int32_t incr,
                                       int32_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -613,6 +631,7 @@ void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                      int64_t *plower, int64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
@@ -623,14 +642,19 @@ void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                       uint64_t *plower, uint64_t *pupper,
                                       int64_t *pstride, int64_t incr,
                                       int64_t chunk) {
+  FunctionTracingRAII();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
       mapping::isSPMDMode());
 }
 
-void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
+  FunctionTracingRAII();
+}
 
-void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
+void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
+  FunctionTracingRAII();
+}
 }
 
 #pragma omp end declare target


        


More information about the Openmp-commits mailing list