[Openmp-commits] [openmp] r359710 - [OpenMP] Implement task modifier for reduction clause
Jonathan Peyton via Openmp-commits
openmp-commits at lists.llvm.org
Wed May 1 10:54:01 PDT 2019
Author: jlpeyton
Date: Wed May 1 10:54:01 2019
New Revision: 359710
URL: http://llvm.org/viewvc/llvm-project?rev=359710&view=rev
Log:
[OpenMP] Implement task modifier for reduction clause
Implemented the task modifier in two versions - one without taking into account
the omp_orig variable (omp_orig can still be processed by the compiler without
help of the library, but each reduction object will need a separate initializer
with global access to omp_orig), and another with the omp_orig variable included
in the interface (a single initializer can be used for multiple reduction
objects of the same type). The second version can be used when omp_orig is not
globally accessible, or to optimize code in the case of multiple reduction
objects of the same type.
Patch by Andrey Churbanov
Differential Revision: https://reviews.llvm.org/D60976
Added:
openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp
openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp
openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp
openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp
Modified:
openmp/trunk/runtime/src/dllexports
openmp/trunk/runtime/src/kmp.h
openmp/trunk/runtime/src/kmp_tasking.cpp
Modified: openmp/trunk/runtime/src/dllexports
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/dllexports?rev=359710&r1=359709&r2=359710&view=diff
==============================================================================
--- openmp/trunk/runtime/src/dllexports (original)
+++ openmp/trunk/runtime/src/dllexports Wed May 1 10:54:01 2019
@@ -404,8 +404,13 @@ kmpc_set_disp_num_buffers
__kmpc_task_reduction_get_th_data 269
# USED FOR 4.5 __kmpc_critical_with_hint 270
__kmpc_get_target_offload 271
- __kmpc_omp_reg_task_with_affinity 272
- __kmpc_pause_resource 273
+ __kmpc_omp_reg_task_with_affinity 272
+ __kmpc_pause_resource 273
+ __kmpc_task_reduction_modifier_init 274
+ __kmpc_task_reduction_modifier_fini 275
+ # __kmpc_task_allow_completion_event 276
+ __kmpc_taskred_init 277
+ __kmpc_taskred_modifier_init 278
%endif
%endif
Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=359710&r1=359709&r2=359710&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Wed May 1 10:54:01 2019
@@ -2682,6 +2682,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_
std::atomic<int> t_construct; // count of single directive encountered by team
char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron
+ // [0] - parallel / [1] - worksharing task reduction data shared by taskgroups
+ std::atomic<void *> t_tg_reduce_data[2]; // to support task modifier
+ std::atomic<int> t_tg_fini_counter[2]; // sync end of task reductions
+
// Master only
// ---------------------------------------------------------------------------
KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
@@ -3818,7 +3822,15 @@ KMP_EXPORT void __kmpc_taskloop(ident_t
#endif
#if OMP_50_ENABLED
KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data);
+KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data);
KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d);
+KMP_EXPORT void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid,
+ int is_ws, int num,
+ void *data);
+KMP_EXPORT void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws,
+ int num, void *data);
+KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid,
+ int is_ws);
KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity(
ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins,
kmp_task_affinity_info_t *affin_list);
Modified: openmp/trunk/runtime/src/kmp_tasking.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_tasking.cpp?rev=359710&r1=359709&r2=359710&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_tasking.cpp (original)
+++ openmp/trunk/runtime/src/kmp_tasking.cpp Wed May 1 10:54:01 2019
@@ -2019,49 +2019,109 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *
#if OMP_50_ENABLED
// Task Reduction implementation
+//
+// Note: initial implementation didn't take into account the possibility
+// to specify omp_orig for initializer of the UDR (user defined reduction).
+// Corrected implementation takes into account the omp_orig object.
+// Compiler is free to use old implementation if omp_orig is not specified.
-typedef struct kmp_task_red_flags {
- unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
- unsigned reserved31 : 31;
-} kmp_task_red_flags_t;
+/*!
+ at ingroup BASIC_TYPES
+@{
+*/
-// internal structure for reduction data item related info
-typedef struct kmp_task_red_data {
- void *reduce_shar; // shared reduction item
- size_t reduce_size; // size of data item
- void *reduce_priv; // thread specific data
- void *reduce_pend; // end of private data for comparison op
- void *reduce_init; // data initialization routine
- void *reduce_fini; // data finalization routine
- void *reduce_comb; // data combiner routine
- kmp_task_red_flags_t flags; // flags for additional info from compiler
-} kmp_task_red_data_t;
+/*!
+Flags for special info per task reduction item.
+*/
+typedef struct kmp_taskred_flags {
+ /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
+ unsigned lazy_priv : 1;
+ unsigned reserved31 : 31;
+} kmp_taskred_flags_t;
-// structure sent us by compiler - one per reduction item
+/*!
+Internal struct for reduction data item related info set up by compiler.
+*/
typedef struct kmp_task_red_input {
- void *reduce_shar; // shared reduction item
- size_t reduce_size; // size of data item
- void *reduce_init; // data initialization routine
- void *reduce_fini; // data finalization routine
- void *reduce_comb; // data combiner routine
- kmp_task_red_flags_t flags; // flags for additional info from compiler
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ size_t reduce_size; /**< size of data item in bytes */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_init; /**< data initialization routine (single parameter) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_comb; /**< data combiner routine */
+ kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;
/*!
- at ingroup TASKING
- at param gtid Global thread ID
- at param num Number of data items to reduce
- at param data Array of data for reduction
- at return The taskgroup identifier
+Internal struct for reduction data item related info saved by the library.
+*/
+typedef struct kmp_taskred_data {
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ size_t reduce_size; /**< size of data item */
+ kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+ void *reduce_priv; /**< array of thread specific items */
+ void *reduce_pend; /**< end of private data for faster comparison op */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_comb; /**< data combiner routine */
+ void *reduce_init; /**< data initialization routine (two parameters) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_orig; /**< original item (can be used in UDR initializer) */
+} kmp_taskred_data_t;
-Initialize task reduction for the taskgroup.
+/*!
+Internal struct for reduction data item related info set up by compiler.
+
+New interface: added reduce_orig field to provide omp_orig for UDR initializer.
*/
-void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+typedef struct kmp_taskred_input {
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ void *reduce_orig; /**< original reduction item used for initialization */
+ size_t reduce_size; /**< size of data item */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_init; /**< data initialization routine (two parameters) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_comb; /**< data combiner routine */
+ kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+} kmp_taskred_input_t;
+/*!
+@}
+*/
+
+template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
+template <>
+void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
+ kmp_task_red_input_t &src) {
+ item.reduce_orig = NULL;
+}
+template <>
+void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
+ kmp_taskred_input_t &src) {
+ if (src.reduce_orig != NULL) {
+ item.reduce_orig = src.reduce_orig;
+ } else {
+ item.reduce_orig = src.reduce_shar;
+ } // non-NULL reduce_orig means new interface used
+}
+
+template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
+template <>
+void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
+ int offset) {
+ ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
+}
+template <>
+void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
+ int offset) {
+ ((void (*)(void *, void *))item.reduce_init)(
+ (char *)(item.reduce_priv) + offset, item.reduce_orig);
+}
+
+template <typename T>
+void *__kmp_task_reduction_init(int gtid, int num, T *data) {
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
kmp_int32 nth = thread->th.th_team_nproc;
- kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
- kmp_task_red_data_t *arr;
+ kmp_taskred_data_t *arr;
// check input data just in case
KMP_ASSERT(tg != NULL);
@@ -2074,33 +2134,34 @@ void *__kmpc_task_reduction_init(int gti
}
KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
gtid, tg, num));
- arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
- thread, num * sizeof(kmp_task_red_data_t));
+ arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
+ thread, num * sizeof(kmp_taskred_data_t));
for (int i = 0; i < num; ++i) {
- void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
- size_t size = input[i].reduce_size - 1;
+ size_t size = data[i].reduce_size - 1;
// round the size up to cache line per thread-specific item
size += CACHE_LINE - size % CACHE_LINE;
- KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
- arr[i].reduce_shar = input[i].reduce_shar;
+ KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
+ arr[i].reduce_shar = data[i].reduce_shar;
arr[i].reduce_size = size;
- arr[i].reduce_init = input[i].reduce_init;
- arr[i].reduce_fini = input[i].reduce_fini;
- arr[i].reduce_comb = input[i].reduce_comb;
- arr[i].flags = input[i].flags;
- if (!input[i].flags.lazy_priv) {
+ arr[i].flags = data[i].flags;
+ arr[i].reduce_comb = data[i].reduce_comb;
+ arr[i].reduce_init = data[i].reduce_init;
+ arr[i].reduce_fini = data[i].reduce_fini;
+ __kmp_assign_orig<T>(arr[i], data[i]);
+ if (!arr[i].flags.lazy_priv) {
// allocate cache-line aligned block and fill it with zeros
arr[i].reduce_priv = __kmp_allocate(nth * size);
arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
- if (f_init != NULL) {
- // initialize thread-specific items
+ if (arr[i].reduce_init != NULL) {
+ // initialize all thread-specific items
for (int j = 0; j < nth; ++j) {
- f_init((char *)(arr[i].reduce_priv) + j * size);
+ __kmp_call_init<T>(arr[i], j * size);
}
}
} else {
// only allocate space for pointers now,
- // objects will be lazily allocated/initialized once requested
+ // objects will be lazily allocated/initialized if/when requested
+ // note that __kmp_allocate zeroes the allocated memory
arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
}
}
@@ -2111,6 +2172,59 @@ void *__kmpc_task_reduction_init(int gti
/*!
@ingroup TASKING
+ at param gtid Global thread ID
+ at param num Number of data items to reduce
+ at param data Array of data for reduction
+ at return The taskgroup identifier
+
+Initialize task reduction for the taskgroup.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has single parameter - pointer to object to be initialized. That means
+the reduction either does not use omp_orig object, or the omp_orig is accessible
+without help of the runtime library.
+*/
+void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+ return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
+}
+
+/*!
+ at ingroup TASKING
+ at param gtid Global thread ID
+ at param num Number of data items to reduce
+ at param data Array of data for reduction
+ at return The taskgroup identifier
+
+Initialize task reduction for the taskgroup.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has two parameters, pointer to object to be initialized and pointer to omp_orig
+*/
+void *__kmpc_taskred_init(int gtid, int num, void *data) {
+ return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
+}
+
+// Copy task reduction data (except for shared pointers).
+template <typename T>
+void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
+ kmp_taskgroup_t *tg, void *reduce_data) {
+ kmp_taskred_data_t *arr;
+ KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
+ " from data %p\n",
+ thr, tg, reduce_data));
+ arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
+ thr, num * sizeof(kmp_taskred_data_t));
+ // threads will share private copies, thunk routines, sizes, flags, etc.:
+ KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
+ for (int i = 0; i < num; ++i) {
+ arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
+ }
+ tg->reduce_data = (void *)arr;
+ tg->reduce_num_data = num;
+}
+
+/*!
+ at ingroup TASKING
@param gtid Global thread ID
@param tskgrp The taskgroup ID (optional)
@param data Shared location of the item
@@ -2128,7 +2242,7 @@ void *__kmpc_task_reduction_get_th_data(
if (tg == NULL)
tg = thread->th.th_current_task->td_taskgroup;
KMP_ASSERT(tg != NULL);
- kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
+ kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
kmp_int32 num = tg->reduce_num_data;
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
@@ -2152,17 +2266,21 @@ void *__kmpc_task_reduction_get_th_data(
found:
if (p_priv[tid] == NULL) {
// allocate thread specific object lazily
- void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
- if (f_init != NULL) {
- f_init(p_priv[tid]);
+ if (arr[i].reduce_init != NULL) {
+ if (arr[i].reduce_orig != NULL) { // new interface
+ ((void (*)(void *, void *))arr[i].reduce_init)(
+ p_priv[tid], arr[i].reduce_orig);
+ } else { // old interface (single parameter)
+ ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
+ }
}
}
return p_priv[tid];
}
}
tg = tg->parent;
- arr = (kmp_task_red_data_t *)(tg->reduce_data);
+ arr = (kmp_taskred_data_t *)(tg->reduce_data);
num = tg->reduce_num_data;
}
KMP_ASSERT2(0, "Unknown task reduction item");
@@ -2174,7 +2292,7 @@ void *__kmpc_task_reduction_get_th_data(
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
kmp_int32 nth = th->th.th_team_nproc;
KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
- kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
+ kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
kmp_int32 num = tg->reduce_num_data;
for (int i = 0; i < num; ++i) {
void *sh_data = arr[i].reduce_shar;
@@ -2207,6 +2325,111 @@ static void __kmp_task_reduction_fini(km
tg->reduce_data = NULL;
tg->reduce_num_data = 0;
}
+
+// Cleanup task reduction data for parallel or worksharing,
+// do not touch task private data other threads still working with.
+// Called from __kmpc_end_taskgroup()
+static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
+ __kmp_thread_free(th, tg->reduce_data);
+ tg->reduce_data = NULL;
+ tg->reduce_num_data = 0;
+}
+
+template <typename T>
+void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
+ int num, T *data) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+ kmp_int32 nth = thr->th.th_team_nproc;
+ __kmpc_taskgroup(loc, gtid); // form new taskgroup first
+ if (nth == 1) {
+ KA_TRACE(10,
+ ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
+ gtid, thr->th.th_current_task->td_taskgroup));
+ return (void *)thr->th.th_current_task->td_taskgroup;
+ }
+ kmp_team_t *team = thr->th.th_team;
+ void *reduce_data;
+ kmp_taskgroup_t *tg;
+ reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
+ if (reduce_data == NULL &&
+ __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
+ (void *)1)) {
+ // single thread enters this block to initialize common reduction data
+ KMP_DEBUG_ASSERT(reduce_data == NULL);
+ // first initialize own data, then make a copy other threads can use
+ tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
+ reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
+ KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
+ // fini counters should be 0 at this point
+ KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
+ KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
+ KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
+ } else {
+ while (
+ (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
+ (void *)1) { // wait for task reduction initialization
+ KMP_CPU_PAUSE();
+ }
+ KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
+ tg = thr->th.th_current_task->td_taskgroup;
+ __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
+ }
+ return tg;
+}
+
+/*!
+ at ingroup TASKING
+ at param loc Source location info
+ at param gtid Global thread ID
+ at param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
+ at param num Number of data items to reduce
+ at param data Array of data for reduction
+ at return The taskgroup identifier
+
+Initialize task reduction for a parallel or worksharing.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has single parameter - pointer to object to be initialized. That means
+the reduction either does not use omp_orig object, or the omp_orig is accessible
+without help of the runtime library.
+*/
+void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
+ int num, void *data) {
+ return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
+ (kmp_task_red_input_t *)data);
+}
+
+/*!
+ at ingroup TASKING
+ at param loc Source location info
+ at param gtid Global thread ID
+ at param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
+ at param num Number of data items to reduce
+ at param data Array of data for reduction
+ at return The taskgroup identifier
+
+Initialize task reduction for a parallel or worksharing.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has two parameters, pointer to object to be initialized and pointer to omp_orig
+*/
+void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
+ void *data) {
+ return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
+ (kmp_taskred_input_t *)data);
+}
+
+/*!
+ at ingroup TASKING
+ at param loc Source location info
+ at param gtid Global thread ID
+ at param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
+
+Finalize task reduction for a parallel or worksharing.
+*/
+void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
+ __kmpc_end_taskgroup(loc, gtid);
+}
#endif
#if OMP_40_ENABLED
@@ -2326,8 +2549,54 @@ void __kmpc_end_taskgroup(ident_t *loc,
KMP_DEBUG_ASSERT(taskgroup->count == 0);
#if OMP_50_ENABLED
- if (taskgroup->reduce_data != NULL) // need to reduce?
- __kmp_task_reduction_fini(thread, taskgroup);
+ if (taskgroup->reduce_data != NULL) { // need to reduce?
+ int cnt;
+ void *reduce_data;
+ kmp_team_t *t = thread->th.th_team;
+ kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
+ // check if <priv> data of the first reduction variable shared for the team
+ void *priv0 = arr[0].reduce_priv;
+ if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
+ ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
+ // finishing task reduction on parallel
+ cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
+ if (cnt == thread->th.th_team_nproc - 1) {
+ // we are the last thread passing __kmpc_reduction_modifier_fini()
+ // finalize task reduction:
+ __kmp_task_reduction_fini(thread, taskgroup);
+ // cleanup fields in the team structure:
+ // TODO: is relaxed store enough here (whole barrier should follow)?
+ __kmp_thread_free(thread, reduce_data);
+ KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
+ KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
+ } else {
+ // we are not the last thread passing __kmpc_reduction_modifier_fini(),
+ // so do not finalize reduction, just clean own copy of the data
+ __kmp_task_reduction_clean(thread, taskgroup);
+ }
+ } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
+ NULL &&
+ ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
+ // finishing task reduction on worksharing
+ cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
+ if (cnt == thread->th.th_team_nproc - 1) {
+ // we are the last thread passing __kmpc_reduction_modifier_fini()
+ __kmp_task_reduction_fini(thread, taskgroup);
+ // cleanup fields in team structure:
+ // TODO: is relaxed store enough here (whole barrier should follow)?
+ __kmp_thread_free(thread, reduce_data);
+ KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
+ KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
+ } else {
+ // we are not the last thread passing __kmpc_reduction_modifier_fini(),
+ // so do not finalize reduction, just clean own copy of the data
+ __kmp_task_reduction_clean(thread, taskgroup);
+ }
+ } else {
+ // finishing task reduction on taskgroup
+ __kmp_task_reduction_fini(thread, taskgroup);
+ }
+ }
#endif
// Restore parent taskgroup for the current task
taskdata->td_taskgroup = taskgroup->parent;
Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp Wed May 1 10:54:01 2019
@@ -0,0 +1,99 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+parallel construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+// extern void* __kmpc_task_reduction_modifier_init(void *loc, int gtid, int
+// is_ws, int num, void* data);
+extern void *__kmpc_taskred_modifier_init(void *loc, int gtid, int is_ws,
+ int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ void *reduce_orig; /**< original reduction item used for initialization */
+ size_t reduce_size; /**< size of data item in bytes */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_init; /**< data initialization routine (two parameters) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_comb; /**< data combiner routine */
+ unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+ int var = INIT;
+ int *p_var_orig = &var;
+ omp_set_dynamic(0);
+ omp_set_num_threads(NT);
+// #pragma omp parallel reduction(task,+:var)
+#pragma omp parallel reduction(+ : var) shared(p_var_orig)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ void *tg; // pointer to taskgroup (optional)
+ red_input_t r_var;
+ r_var.reduce_shar = &var;
+ r_var.reduce_orig =
+ p_var_orig; // not used in this test but illustrates codegen
+ r_var.reduce_size = sizeof(var);
+ r_var.reduce_init = NULL;
+ r_var.reduce_fini = NULL;
+ r_var.reduce_comb = (void *)&i_comb;
+ tg = __kmpc_taskred_modifier_init(
+ NULL, // ident_t loc;
+ gtid,
+ 0, // 1 - worksharing construct, 0 - parallel
+ 1, // number of reduction objects
+ &r_var // related data
+ );
+ var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ }
+ __kmpc_task_reduction_modifier_fini(NULL, gtid, 0);
+ }
+ if (var == INIT + NT * 3 - 1) {
+ printf("passed\n");
+ return 0;
+ } else {
+ printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+ return 1;
+ }
+}
Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp Wed May 1 10:54:01 2019
@@ -0,0 +1,93 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+parallel construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+extern void *__kmpc_task_reduction_modifier_init(void *loc, int gtid, int is_ws,
+ int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ size_t reduce_size; /**< size of data item in bytes */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_init; /**< data initialization routine (single parameter) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_comb; /**< data combiner routine */
+ unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+ int var = INIT;
+ omp_set_dynamic(0);
+ omp_set_num_threads(NT);
+// #pragma omp parallel reduction(task,+:var)
+#pragma omp parallel reduction(+ : var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ void *tg; // pointer to taskgroup (optional)
+ red_input_t r_var;
+ r_var.reduce_shar = &var;
+ r_var.reduce_size = sizeof(var);
+ r_var.reduce_init = NULL;
+ r_var.reduce_fini = NULL;
+ r_var.reduce_comb = (void *)&i_comb;
+ tg = __kmpc_task_reduction_modifier_init(
+ NULL, // ident_t loc;
+ gtid,
+ 0, // 1 - worksharing construct, 0 - parallel
+ 1, // number of reduction objects
+ &r_var // related data
+ );
+ var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ }
+ __kmpc_task_reduction_modifier_fini(NULL, gtid, 0);
+ }
+ if (var == INIT + NT * 3 - 1) {
+ printf("passed\n");
+ return 0;
+ } else {
+ printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+ return 1;
+ }
+}
Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp Wed May 1 10:54:01 2019
@@ -0,0 +1,114 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+a worksharing (for) construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+// extern void* __kmpc_task_reduction_modifier_init(void *loc, int gtid, int
+// flags, int num, void* data);
+extern void *__kmpc_taskred_modifier_init(void *loc, int gtid, int is_ws,
+ int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ void *reduce_orig; /**< original reduction item used for initialization */
+ size_t reduce_size; /**< size of data item in bytes */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_init; /**< data initialization routine (single parameter) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_comb; /**< data combiner routine */
+ unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+ int var = INIT;
+ int *p_var_orig = &var;
+ int i;
+ omp_set_dynamic(0);
+ omp_set_num_threads(NT);
+#pragma omp parallel private(i) shared(p_var_orig)
+// #pragma omp for reduction(task,+:var)
+#pragma omp for reduction(+ : var)
+ for (i = 0; i < NT; ++i) // single iteration per thread
+ {
+ // generated code, which actually should be placed before
+ // loop iterations distribution, but placed here just to show the idea,
+ // and to keep correctness the loop count is equal to number of threads
+ int gtid = __kmpc_global_thread_num(NULL);
+ void *tg; // pointer to taskgroup (optional)
+ red_input_t r_var;
+ r_var.reduce_shar = &var;
+ r_var.reduce_orig =
+ p_var_orig; // not used in this test but illustrates codegen
+ r_var.reduce_size = sizeof(var);
+ r_var.reduce_init = NULL;
+ r_var.reduce_fini = NULL;
+ r_var.reduce_comb = (void *)&i_comb;
+ tg = __kmpc_taskred_modifier_init(
+ NULL, // ident_t loc;
+ gtid,
+ 1, // 1 - worksharing construct, 0 - parallel
+ 1, // number of reduction objects
+ &r_var // related data
+ );
+ // end of generated code
+ var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ // emulate task reduction here because of compiler bug:
+ // it mistakenly declines to accept in_reduction because var is private
+ // outside.
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ }
+ // generated code, which actually should be placed after loop completion
+ // but before the barrier and before the loop reduction. It is placed here
+ // just to show the idea,
+ // and to keep correctness the loop count is equal to the number of threads
+ __kmpc_task_reduction_modifier_fini(NULL, gtid, 1);
+ // end of generated code
+ }
+ if (var == INIT + NT * 3 - 1) {
+ printf("passed\n");
+ return 0;
+ } else {
+ printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+ return 1;
+ }
+}
Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp Wed May 1 10:54:01 2019
@@ -0,0 +1,108 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+a worksharing (for) construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+extern void *__kmpc_task_reduction_modifier_init(void *loc, int gtid, int is_ws,
+ int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+ void *reduce_shar; /**< shared between tasks item to reduce into */
+ size_t reduce_size; /**< size of data item in bytes */
+ // three compiler-generated routines (init, fini are optional):
+ void *reduce_init; /**< data initialization routine (single parameter) */
+ void *reduce_fini; /**< data finalization routine */
+ void *reduce_comb; /**< data combiner routine */
+ unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+ int var = INIT;
+ int i;
+ omp_set_dynamic(0);
+ omp_set_num_threads(NT);
+#pragma omp parallel private(i)
+// #pragma omp for reduction(task,+:var)
+#pragma omp for reduction(+ : var)
+ for (i = 0; i < NT; ++i) // single iteration per thread
+ {
+ // generated code, which actually should be placed before
+ // loop iterations distribution, but placed here just to show the idea,
+ // and to keep correctness the loop count is equal to number of threads
+ int gtid = __kmpc_global_thread_num(NULL);
+ void *tg; // pointer to taskgroup (optional)
+ red_input_t r_var;
+ r_var.reduce_shar = &var;
+ r_var.reduce_size = sizeof(var);
+ r_var.reduce_init = NULL;
+ r_var.reduce_fini = NULL;
+ r_var.reduce_comb = (void *)&i_comb;
+ tg = __kmpc_task_reduction_modifier_init(
+ NULL, // ident_t loc;
+ gtid,
+ 1, // 1 - worksharing construct, 0 - parallel
+ 1, // number of reduction objects
+ &r_var // related data
+ );
+ // end of generated code
+ var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ // emulate task reduction here because of compiler bug:
+ // it mistakenly declines to accept in_reduction because var is private
+ // outside.
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+ {
+ int gtid = __kmpc_global_thread_num(NULL);
+ int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+ *p_var += 1;
+ }
+ }
+ // generated code, which actually should be placed after loop completion
+ // but before the barrier and before the loop reduction. It is placed here
+ // just to show the idea,
+ // and to keep correctness the loop count is equal to the number of threads
+ __kmpc_task_reduction_modifier_fini(NULL, gtid, 1);
+ // end of generated code
+ }
+ if (var == INIT + NT * 3 - 1) {
+ printf("passed\n");
+ return 0;
+ } else {
+ printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+ return 1;
+ }
+}
More information about the Openmp-commits
mailing list