[Openmp-commits] [openmp] 3356e26 - [OpenMP] Implementation of OMPT reduction callbacks
via Openmp-commits
openmp-commits at lists.llvm.org
Fri Dec 27 06:31:10 PST 2019
Author: protze at itc.rwth-aachen.de
Date: 2019-12-27T15:30:51+01:00
New Revision: 3356e268f6cdfd4815da70ae4fc6b8db63163375
URL: https://github.com/llvm/llvm-project/commit/3356e268f6cdfd4815da70ae4fc6b8db63163375
DIFF: https://github.com/llvm/llvm-project/commit/3356e268f6cdfd4815da70ae4fc6b8db63163375.diff
LOG: [OpenMP] Implementation of OMPT reduction callbacks
Including two tests
These callbacks were added late to the 5.0 specification; an implementation was still missing.
Reviewed By: jdoerfert
Differential Review: https://reviews.llvm.org/D70395
Added:
openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
Modified:
openmp/runtime/src/kmp_barrier.cpp
openmp/runtime/src/kmp_csupport.cpp
openmp/runtime/src/ompt-event-specific.h
openmp/runtime/src/ompt-specific.h
openmp/runtime/test/ompt/callback.h
Removed:
################################################################################
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index e17986b16a95..a6d87b5d7a2e 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -15,9 +15,7 @@
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
-#if OMPT_SUPPORT
#include "ompt-specific.h"
-#endif
#if KMP_MIC
#include <immintrin.h>
@@ -128,8 +126,11 @@ static bool __kmp_linear_barrier_gather_template(
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
team->t.t_id, i));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[i]->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
@@ -355,8 +356,11 @@ __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
@@ -600,8 +604,11 @@ __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
team->t.t_id, child_tid));
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
(*reduce)(this_thr->th.th_local.reduce_data,
child_thr->th.th_local.reduce_data);
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
@@ -912,6 +919,8 @@ static void __kmp_hierarchical_barrier_gather(
flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
if (reduce) {
ANNOTATE_REDUCE_AFTER(reduce);
+ OMPT_REDUCTION_DECL(this_thr, gtid);
+ OMPT_REDUCTION_BEGIN;
for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
++child_tid) {
KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
@@ -923,6 +932,7 @@ static void __kmp_hierarchical_barrier_gather(
(*reduce)(this_thr->th.th_local.reduce_data,
other_threads[child_tid]->th.th_local.reduce_data);
}
+ OMPT_REDUCTION_END;
ANNOTATE_REDUCE_BEFORE(reduce);
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
}
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index d39bf9af4334..ac9a93590ad0 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -18,10 +18,7 @@
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
-
-#if OMPT_SUPPORT
#include "ompt-specific.h"
-#endif
#define MAX_MESSAGE 512
@@ -3429,13 +3426,18 @@ __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
__KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+ OMPT_REDUCTION_DECL(th, global_tid);
if (packed_reduction_method == critical_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
+
__kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
retval = 1;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
+
// usage: if team size == 1, no synchronization is required ( Intel
// platforms only )
retval = 1;
@@ -3536,15 +3538,20 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
+ OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
+
if (packed_reduction_method == critical_reduce_block) {
__kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+ OMPT_REDUCTION_END;
} else if (packed_reduction_method == empty_reduce_block) {
// usage: if team size == 1, no synchronization is required ( on Intel
// platforms only )
+ OMPT_REDUCTION_END;
+
} else if (packed_reduction_method == atomic_reduce_block) {
// neither master nor other workers should get here
@@ -3556,6 +3563,7 @@ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
tree_reduce_block)) {
// only master gets here
+ // OMPT: tree reduction is annotated in the barrier code
} else {
@@ -3629,13 +3637,17 @@ kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
__KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+ OMPT_REDUCTION_DECL(th, global_tid);
+
if (packed_reduction_method == critical_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
__kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
retval = 1;
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_BEGIN;
// usage: if team size == 1, no synchronization is required ( Intel
// platforms only )
retval = 1;
@@ -3723,10 +3735,13 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
// this barrier should be visible to a customer and to the threading profile
// tool (it's a terminating barrier on constructs if NOWAIT not specified)
+ OMPT_REDUCTION_DECL(th, global_tid);
if (packed_reduction_method == critical_reduce_block) {
__kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+ OMPT_REDUCTION_END;
+
// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
ompt_frame_t *ompt_frame;
@@ -3749,6 +3764,8 @@ void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
} else if (packed_reduction_method == empty_reduce_block) {
+ OMPT_REDUCTION_END;
+
// usage: if team size==1, no synchronization is required (Intel platforms only)
// TODO: implicit barrier: should be exposed
diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h
index da6a0e424726..a5901b511148 100644
--- a/openmp/runtime/src/ompt-event-specific.h
+++ b/openmp/runtime/src/ompt-event-specific.h
@@ -99,7 +99,7 @@
#define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL
-#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL
#define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h
index 47d8a1669846..5ba240c1a950 100644
--- a/openmp/runtime/src/ompt-specific.h
+++ b/openmp/runtime/src/ompt-specific.h
@@ -15,6 +15,7 @@
#include "kmp.h"
+#if OMPT_SUPPORT
/*****************************************************************************
* forward declarations
****************************************************************************/
@@ -101,5 +102,30 @@ inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) {
inline const char *ompt_get_runtime_version() {
return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
}
+#endif // OMPT_SUPPORT
+
+// Macros emitting the OMPT reduction callbacks (scope begin/end); they expand to nothing unless OMPT_SUPPORT && OMPT_OPTIONAL
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+#define OMPT_REDUCTION_DECL(this_thr, gtid) \
+ ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \
+ ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \
+ void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+#define OMPT_REDUCTION_BEGIN \
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \
+ ompt_callbacks.ompt_callback(ompt_callback_reduction)( \
+ ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \
+ my_task_data, return_address); \
+ }
+#define OMPT_REDUCTION_END \
+ if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \
+ ompt_callbacks.ompt_callback(ompt_callback_reduction)( \
+ ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \
+ my_task_data, return_address); \
+ }
+#else // OMPT_SUPPORT && OMPT_OPTIONAL
+#define OMPT_REDUCTION_DECL(this_thr, gtid)
+#define OMPT_REDUCTION_BEGIN
+#define OMPT_REDUCTION_END
+#endif // !(OMPT_SUPPORT && OMPT_OPTIONAL)
#endif
diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h
index 64463ec83db5..a5ba897c6ec1 100644
--- a/openmp/runtime/test/ompt/callback.h
+++ b/openmp/runtime/test/ompt/callback.h
@@ -358,6 +358,9 @@ on_ompt_callback_sync_region(
printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region\n");
+ exit(-1);
break;
}
break;
@@ -377,6 +380,9 @@ on_ompt_callback_sync_region(
printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region\n");
+ exit(-1);
break;
}
break;
@@ -409,6 +415,9 @@ on_ompt_callback_sync_region_wait(
printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region_wait\n");
+ exit(-1);
break;
}
break;
@@ -428,12 +437,38 @@ on_ompt_callback_sync_region_wait(
printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
break;
case ompt_sync_region_reduction:
+ printf("ompt_sync_region_reduction should never be passed to "
+ "on_ompt_callback_sync_region_wait\n");
+ exit(-1);
break;
}
break;
}
}
+static void on_ompt_callback_reduction(ompt_sync_region_t kind,
+ ompt_scope_endpoint_t endpoint,
+ ompt_data_t *parallel_data,
+ ompt_data_t *task_data,
+ const void *codeptr_ra) {
+ switch (endpoint) {
+ case ompt_scope_begin:
+ printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64
+ ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+ ompt_get_thread_data()->value,
+ (parallel_data) ? parallel_data->value : 0, task_data->value,
+ codeptr_ra);
+ break;
+ case ompt_scope_end:
+ printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64
+ ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+ ompt_get_thread_data()->value,
+ (parallel_data) ? parallel_data->value : 0, task_data->value,
+ codeptr_ra);
+ break;
+ }
+}
+
static void
on_ompt_callback_flush(
ompt_data_t *thread_data,
@@ -784,6 +819,7 @@ int ompt_initialize(
register_callback(ompt_callback_nest_lock);
register_callback(ompt_callback_sync_region);
register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+ register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t);
register_callback(ompt_callback_control_tool);
register_callback(ompt_callback_flush);
register_callback(ompt_callback_cancel);
diff --git a/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
new file mode 100644
index 000000000000..ca032984d50e
--- /dev/null
+++ b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
@@ -0,0 +1,38 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+int main() {
+ int sum = 0;
+ int i;
+#pragma omp parallel num_threads(1)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+ for (i = 0; i < 10000; i++) {
+ sum += i;
+ }
+
+ // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+ // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+ // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+ // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]],
+ // CHECK-SAME: codeptr_ra=
+ // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]],
+ // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra=
+
+ return 0;
+}
diff --git a/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
new file mode 100644
index 000000000000..2c73fe139004
--- /dev/null
+++ b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
@@ -0,0 +1,48 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+#define FOR_CLAUSE nowait
+#else
+#define FOR_CLAUSE
+#endif
+
+int main() {
+ int sum = 0;
+ int i;
+#pragma omp parallel num_threads(5)
+#pragma omp for reduction(+ : sum) FOR_CLAUSE
+ for (i = 0; i < 10000; i++) {
+ sum += i;
+ }
+
+ // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+ // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+ // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+ // order and distribution to threads not determined
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+ // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end:
+ // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}
+
+ return 0;
+}
More information about the Openmp-commits
mailing list