[Openmp-commits] [openmp] 3356e26 - [OpenMP] Implementation of OMPT reduction callbacks

via Openmp-commits openmp-commits at lists.llvm.org
Fri Dec 27 06:31:10 PST 2019


Author: protze at itc.rwth-aachen.de
Date: 2019-12-27T15:30:51+01:00
New Revision: 3356e268f6cdfd4815da70ae4fc6b8db63163375

URL: https://github.com/llvm/llvm-project/commit/3356e268f6cdfd4815da70ae4fc6b8db63163375
DIFF: https://github.com/llvm/llvm-project/commit/3356e268f6cdfd4815da70ae4fc6b8db63163375.diff

LOG: [OpenMP] Implementation of OMPT reduction callbacks

Including two tests
These callbacks were added late to the 5.0 specification, and an implementation was still missing.

Reviewed By: jdoerfert

Differential Review: https://reviews.llvm.org/D70395

Added: 
    openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
    openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c

Modified: 
    openmp/runtime/src/kmp_barrier.cpp
    openmp/runtime/src/kmp_csupport.cpp
    openmp/runtime/src/ompt-event-specific.h
    openmp/runtime/src/ompt-specific.h
    openmp/runtime/test/ompt/callback.h

Removed: 
    


################################################################################
diff  --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index e17986b16a95..a6d87b5d7a2e 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -15,9 +15,7 @@
 #include "kmp_itt.h"
 #include "kmp_os.h"
 #include "kmp_stats.h"
-#if OMPT_SUPPORT
 #include "ompt-specific.h"
-#endif
 
 #if KMP_MIC
 #include <immintrin.h>
@@ -128,8 +126,11 @@ static bool __kmp_linear_barrier_gather_template(
                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                   team->t.t_id, i));
         ANNOTATE_REDUCE_AFTER(reduce);
+        OMPT_REDUCTION_DECL(this_thr, gtid);
+        OMPT_REDUCTION_BEGIN;
         (*reduce)(this_thr->th.th_local.reduce_data,
                   other_threads[i]->th.th_local.reduce_data);
+        OMPT_REDUCTION_END;
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
@@ -355,8 +356,11 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                   team->t.t_id, child_tid));
         ANNOTATE_REDUCE_AFTER(reduce);
+        OMPT_REDUCTION_DECL(this_thr, gtid);
+        OMPT_REDUCTION_BEGIN;
         (*reduce)(this_thr->th.th_local.reduce_data,
                   child_thr->th.th_local.reduce_data);
+        OMPT_REDUCTION_END;
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
@@ -600,8 +604,11 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                   gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                   team->t.t_id, child_tid));
         ANNOTATE_REDUCE_AFTER(reduce);
+        OMPT_REDUCTION_DECL(this_thr, gtid);
+        OMPT_REDUCTION_BEGIN;
         (*reduce)(this_thr->th.th_local.reduce_data,
                   child_thr->th.th_local.reduce_data);
+        OMPT_REDUCTION_END;
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
@@ -912,6 +919,8 @@ static void __kmp_hierarchical_barrier_gather(
         flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
         if (reduce) {
           ANNOTATE_REDUCE_AFTER(reduce);
+          OMPT_REDUCTION_DECL(this_thr, gtid);
+          OMPT_REDUCTION_BEGIN;
           for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
                ++child_tid) {
             KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
@@ -923,6 +932,7 @@ static void __kmp_hierarchical_barrier_gather(
             (*reduce)(this_thr->th.th_local.reduce_data,
                       other_threads[child_tid]->th.th_local.reduce_data);
           }
+          OMPT_REDUCTION_END;
           ANNOTATE_REDUCE_BEFORE(reduce);
           ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
         }

diff  --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index d39bf9af4334..ac9a93590ad0 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -18,10 +18,7 @@
 #include "kmp_itt.h"
 #include "kmp_lock.h"
 #include "kmp_stats.h"
-
-#if OMPT_SUPPORT
 #include "ompt-specific.h"
-#endif
 
 #define MAX_MESSAGE 512
 
@@ -3429,13 +3426,18 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
 
+  OMPT_REDUCTION_DECL(th, global_tid);
   if (packed_reduction_method == critical_reduce_block) {
 
+    OMPT_REDUCTION_BEGIN;
+
     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
     retval = 1;
 
   } else if (packed_reduction_method == empty_reduce_block) {
 
+    OMPT_REDUCTION_BEGIN;
+
     // usage: if team size == 1, no synchronization is required ( Intel
     // platforms only )
     retval = 1;
@@ -3536,15 +3538,20 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
 
   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
 
+  OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
+
   if (packed_reduction_method == critical_reduce_block) {
 
     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+    OMPT_REDUCTION_END;
 
   } else if (packed_reduction_method == empty_reduce_block) {
 
     // usage: if team size == 1, no synchronization is required ( on Intel
     // platforms only )
 
+    OMPT_REDUCTION_END;
+
   } else if (packed_reduction_method == atomic_reduce_block) {
 
     // neither master nor other workers should get here
@@ -3556,6 +3563,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
                                    tree_reduce_block)) {
 
     // only master gets here
+    // OMPT: tree reduction is annotated in the barrier code
 
   } else {
 
@@ -3629,13 +3637,17 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
 
+  OMPT_REDUCTION_DECL(th, global_tid);
+
   if (packed_reduction_method == critical_reduce_block) {
 
+    OMPT_REDUCTION_BEGIN;
     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
     retval = 1;
 
   } else if (packed_reduction_method == empty_reduce_block) {
 
+    OMPT_REDUCTION_BEGIN;
     // usage: if team size == 1, no synchronization is required ( Intel
     // platforms only )
     retval = 1;
@@ -3723,10 +3735,13 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
 
   // this barrier should be visible to a customer and to the threading profile
   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
+  OMPT_REDUCTION_DECL(th, global_tid);
 
   if (packed_reduction_method == critical_reduce_block) {
     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
 
+    OMPT_REDUCTION_END;
+
 // TODO: implicit barrier: should be exposed
 #if OMPT_SUPPORT
     ompt_frame_t *ompt_frame;
@@ -3749,6 +3764,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
 
   } else if (packed_reduction_method == empty_reduce_block) {
 
+    OMPT_REDUCTION_END;
+
 // usage: if team size==1, no synchronization is required (Intel platforms only)
 
 // TODO: implicit barrier: should be exposed

diff  --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h
index da6a0e424726..a5901b511148 100644
--- a/openmp/runtime/src/ompt-event-specific.h
+++ b/openmp/runtime/src/ompt-event-specific.h
@@ -99,7 +99,7 @@
 
 #define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
-#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL
 
 #define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
 

diff  --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h
index 47d8a1669846..5ba240c1a950 100644
--- a/openmp/runtime/src/ompt-specific.h
+++ b/openmp/runtime/src/ompt-specific.h
@@ -15,6 +15,7 @@
 
 #include "kmp.h"
 
+#if OMPT_SUPPORT
 /*****************************************************************************
  * forward declarations
  ****************************************************************************/
@@ -101,5 +102,30 @@ inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) {
 inline const char *ompt_get_runtime_version() {
   return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
 }
+#endif // OMPT_SUPPORT
+
+// macros providing the OMPT callbacks for reduction clause
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+#define OMPT_REDUCTION_DECL(this_thr, gtid)                                    \
+  ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);                    \
+  ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);                \
+  void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+#define OMPT_REDUCTION_BEGIN                                                   \
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {          \
+    ompt_callbacks.ompt_callback(ompt_callback_reduction)(                     \
+        ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,        \
+        my_task_data, return_address);                                         \
+  }
+#define OMPT_REDUCTION_END                                                     \
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {          \
+    ompt_callbacks.ompt_callback(ompt_callback_reduction)(                     \
+        ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,          \
+        my_task_data, return_address);                                         \
+  }
+#else // OMPT_SUPPORT && OMPT_OPTIONAL
+#define OMPT_REDUCTION_DECL(this_thr, gtid)
+#define OMPT_REDUCTION_BEGIN
+#define OMPT_REDUCTION_END
+#endif // !(OMPT_SUPPORT && OMPT_OPTIONAL)
 
 #endif

diff  --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h
index 64463ec83db5..a5ba897c6ec1 100644
--- a/openmp/runtime/test/ompt/callback.h
+++ b/openmp/runtime/test/ompt/callback.h
@@ -358,6 +358,9 @@ on_ompt_callback_sync_region(
           printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
           break;
         case ompt_sync_region_reduction:
+          printf("ompt_sync_region_reduction should never be passed to "
+                 "on_ompt_callback_sync_region\n");
+          exit(-1);
           break;
       }
       break;
@@ -377,6 +380,9 @@ on_ompt_callback_sync_region(
           printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
           break;
         case ompt_sync_region_reduction:
+          printf("ompt_sync_region_reduction should never be passed to "
+                 "on_ompt_callback_sync_region\n");
+          exit(-1);
           break;
       }
       break;
@@ -409,6 +415,9 @@ on_ompt_callback_sync_region_wait(
           printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
           break;
         case ompt_sync_region_reduction:
+          printf("ompt_sync_region_reduction should never be passed to "
+                 "on_ompt_callback_sync_region_wait\n");
+          exit(-1);
           break;
       }
       break;
@@ -428,12 +437,38 @@ on_ompt_callback_sync_region_wait(
           printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
           break;
         case ompt_sync_region_reduction:
+          printf("ompt_sync_region_reduction should never be passed to "
+                 "on_ompt_callback_sync_region_wait\n");
+          exit(-1);
           break;
       }
       break;
   }
 }
 
+static void on_ompt_callback_reduction(ompt_sync_region_t kind,
+                                       ompt_scope_endpoint_t endpoint,
+                                       ompt_data_t *parallel_data,
+                                       ompt_data_t *task_data,
+                                       const void *codeptr_ra) {
+  switch (endpoint) {
+  case ompt_scope_begin:
+    printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value,
+           (parallel_data) ? parallel_data->value : 0, task_data->value,
+           codeptr_ra);
+    break;
+  case ompt_scope_end:
+    printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value,
+           (parallel_data) ? parallel_data->value : 0, task_data->value,
+           codeptr_ra);
+    break;
+  }
+}
+
 static void
 on_ompt_callback_flush(
     ompt_data_t *thread_data,
@@ -784,6 +819,7 @@ int ompt_initialize(
   register_callback(ompt_callback_nest_lock);
   register_callback(ompt_callback_sync_region);
   register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+  register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t);
   register_callback(ompt_callback_control_tool);
   register_callback(ompt_callback_flush);
   register_callback(ompt_callback_cancel);

diff  --git a/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
new file mode 100644
index 000000000000..ca032984d50e
--- /dev/null
+++ b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
@@ -0,0 +1,38 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+int main() {
+  int sum = 0;
+  int i;
+#pragma omp parallel num_threads(1)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+  for (i = 0; i < 10000; i++) {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]],
+  // CHECK-SAME: codeptr_ra=
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]],
+  // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra=
+
+  return 0;
+}

diff  --git a/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
new file mode 100644
index 000000000000..2c73fe139004
--- /dev/null
+++ b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
@@ -0,0 +1,48 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+int main() {
+  int sum = 0;
+  int i;
+#pragma omp parallel num_threads(5)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+  for (i = 0; i < 10000; i++) {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // order and distribution to threads not determined
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+  // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+
+  return 0;
+}


        


More information about the Openmp-commits mailing list