[Openmp-commits] [openmp] r359710 - [OpenMP] Implement task modifier for reduction clause

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Wed May 1 10:54:01 PDT 2019


Author: jlpeyton
Date: Wed May  1 10:54:01 2019
New Revision: 359710

URL: http://llvm.org/viewvc/llvm-project?rev=359710&view=rev
Log:
[OpenMP] Implement task modifier for reduction clause

Implemented task modifier in two versions - one without taking into account
omp_orig variable (the omp_orig still can be processed by compiler without help
of the library, but each reduction object will need separate initializer with
global access to omp_orig), another with omp_orig variable included into
interface (single initializer can be used for multiple reduction objects of
the same type). Second version can be used when the omp_orig is not globally
accessible, or to optimize code in case of multiple reduction objects
of the same type.

Patch by Andrey Churbanov

Differential Revision: https://reviews.llvm.org/D60976

Added:
    openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp
    openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp
    openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp
    openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp
Modified:
    openmp/trunk/runtime/src/dllexports
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_tasking.cpp

Modified: openmp/trunk/runtime/src/dllexports
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/dllexports?rev=359710&r1=359709&r2=359710&view=diff
==============================================================================
--- openmp/trunk/runtime/src/dllexports (original)
+++ openmp/trunk/runtime/src/dllexports Wed May  1 10:54:01 2019
@@ -404,8 +404,13 @@ kmpc_set_disp_num_buffers
         __kmpc_task_reduction_get_th_data   269
 # USED FOR 4.5 __kmpc_critical_with_hint    270
         __kmpc_get_target_offload           271
-	__kmpc_omp_reg_task_with_affinity   272
-	__kmpc_pause_resource               273
+        __kmpc_omp_reg_task_with_affinity   272
+        __kmpc_pause_resource               273
+        __kmpc_task_reduction_modifier_init 274
+        __kmpc_task_reduction_modifier_fini 275
+      # __kmpc_task_allow_completion_event  276
+        __kmpc_taskred_init                 277
+        __kmpc_taskred_modifier_init        278
     %endif
 %endif
 

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=359710&r1=359709&r2=359710&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Wed May  1 10:54:01 2019
@@ -2682,6 +2682,10 @@ typedef struct KMP_ALIGN_CACHE kmp_base_
   std::atomic<int> t_construct; // count of single directive encountered by team
   char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron
 
+  // [0] - parallel / [1] - worksharing task reduction data shared by taskgroups
+  std::atomic<void *> t_tg_reduce_data[2]; // to support task modifier
+  std::atomic<int> t_tg_fini_counter[2]; // sync end of task reductions
+
   // Master only
   // ---------------------------------------------------------------------------
   KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
@@ -3818,7 +3822,15 @@ KMP_EXPORT void __kmpc_taskloop(ident_t
 #endif
 #if OMP_50_ENABLED
 KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data);
+KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data);
 KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d);
+KMP_EXPORT void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid,
+                                                     int is_ws, int num,
+                                                     void *data);
+KMP_EXPORT void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws,
+                                              int num, void *data);
+KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid,
+                                                    int is_ws);
 KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins,
     kmp_task_affinity_info_t *affin_list);

Modified: openmp/trunk/runtime/src/kmp_tasking.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_tasking.cpp?rev=359710&r1=359709&r2=359710&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_tasking.cpp (original)
+++ openmp/trunk/runtime/src/kmp_tasking.cpp Wed May  1 10:54:01 2019
@@ -2019,49 +2019,109 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *
 
 #if OMP_50_ENABLED
 // Task Reduction implementation
+//
+// Note: initial implementation didn't take into account the possibility
+// to specify omp_orig for initializer of the UDR (user defined reduction).
+// Corrected implementation takes into account the omp_orig object.
+// Compiler is free to use old implementation if omp_orig is not specified.
 
-typedef struct kmp_task_red_flags {
-  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
-  unsigned reserved31 : 31;
-} kmp_task_red_flags_t;
+/*!
+@ingroup BASIC_TYPES
+@{
+*/
 
-// internal structure for reduction data item related info
-typedef struct kmp_task_red_data {
-  void *reduce_shar; // shared reduction item
-  size_t reduce_size; // size of data item
-  void *reduce_priv; // thread specific data
-  void *reduce_pend; // end of private data for comparison op
-  void *reduce_init; // data initialization routine
-  void *reduce_fini; // data finalization routine
-  void *reduce_comb; // data combiner routine
-  kmp_task_red_flags_t flags; // flags for additional info from compiler
-} kmp_task_red_data_t;
+/*!
+Flags for special info per task reduction item.
+*/
+typedef struct kmp_taskred_flags {
+  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
+  unsigned lazy_priv : 1;
+  unsigned reserved31 : 31;
+} kmp_taskred_flags_t;
 
-// structure sent us by compiler - one per reduction item
+/*!
+Internal struct for reduction data item related info set up by compiler.
+*/
 typedef struct kmp_task_red_input {
-  void *reduce_shar; // shared reduction item
-  size_t reduce_size; // size of data item
-  void *reduce_init; // data initialization routine
-  void *reduce_fini; // data finalization routine
-  void *reduce_comb; // data combiner routine
-  kmp_task_red_flags_t flags; // flags for additional info from compiler
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
 } kmp_task_red_input_t;
 
 /*!
-@ingroup TASKING
-@param gtid      Global thread ID
-@param num       Number of data items to reduce
-@param data      Array of data for reduction
-@return The taskgroup identifier
+Internal struct for reduction data item related info saved by the library.
+*/
+typedef struct kmp_taskred_data {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item */
+  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+  void *reduce_priv; /**< array of thread specific items */
+  void *reduce_pend; /**< end of private data for faster comparison op */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_comb; /**< data combiner routine */
+  void *reduce_init; /**< data initialization routine (two parameters) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_orig; /**< original item (can be used in UDR initializer) */
+} kmp_taskred_data_t;
 
-Initialize task reduction for the taskgroup.
+/*!
+Internal struct for reduction data item related info set up by compiler.
+
+New interface: added reduce_orig field to provide omp_orig for UDR initializer.
 */
-void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+typedef struct kmp_taskred_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  void *reduce_orig; /**< original reduction item used for initialization */
+  size_t reduce_size; /**< size of data item */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (two parameters) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+} kmp_taskred_input_t;
+/*!
+@}
+*/
+
+template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
+template <>
+void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
+                                             kmp_task_red_input_t &src) {
+  item.reduce_orig = NULL;
+}
+template <>
+void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
+                                            kmp_taskred_input_t &src) {
+  if (src.reduce_orig != NULL) {
+    item.reduce_orig = src.reduce_orig;
+  } else {
+    item.reduce_orig = src.reduce_shar;
+  } // non-NULL reduce_orig means new interface used
+}
+
+template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
+template <>
+void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
+                                           int offset) {
+  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
+}
+template <>
+void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
+                                          int offset) {
+  ((void (*)(void *, void *))item.reduce_init)(
+      (char *)(item.reduce_priv) + offset, item.reduce_orig);
+}
+
+template <typename T>
+void *__kmp_task_reduction_init(int gtid, int num, T *data) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
   kmp_int32 nth = thread->th.th_team_nproc;
-  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
-  kmp_task_red_data_t *arr;
+  kmp_taskred_data_t *arr;
 
   // check input data just in case
   KMP_ASSERT(tg != NULL);
@@ -2074,33 +2134,34 @@ void *__kmpc_task_reduction_init(int gti
   }
   KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                 gtid, tg, num));
-  arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
-      thread, num * sizeof(kmp_task_red_data_t));
+  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
+      thread, num * sizeof(kmp_taskred_data_t));
   for (int i = 0; i < num; ++i) {
-    void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
-    size_t size = input[i].reduce_size - 1;
+    size_t size = data[i].reduce_size - 1;
     // round the size up to cache line per thread-specific item
     size += CACHE_LINE - size % CACHE_LINE;
-    KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
-    arr[i].reduce_shar = input[i].reduce_shar;
+    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
+    arr[i].reduce_shar = data[i].reduce_shar;
     arr[i].reduce_size = size;
-    arr[i].reduce_init = input[i].reduce_init;
-    arr[i].reduce_fini = input[i].reduce_fini;
-    arr[i].reduce_comb = input[i].reduce_comb;
-    arr[i].flags = input[i].flags;
-    if (!input[i].flags.lazy_priv) {
+    arr[i].flags = data[i].flags;
+    arr[i].reduce_comb = data[i].reduce_comb;
+    arr[i].reduce_init = data[i].reduce_init;
+    arr[i].reduce_fini = data[i].reduce_fini;
+    __kmp_assign_orig<T>(arr[i], data[i]);
+    if (!arr[i].flags.lazy_priv) {
       // allocate cache-line aligned block and fill it with zeros
       arr[i].reduce_priv = __kmp_allocate(nth * size);
       arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
-      if (f_init != NULL) {
-        // initialize thread-specific items
+      if (arr[i].reduce_init != NULL) {
+        // initialize all thread-specific items
         for (int j = 0; j < nth; ++j) {
-          f_init((char *)(arr[i].reduce_priv) + j * size);
+          __kmp_call_init<T>(arr[i], j * size);
         }
       }
     } else {
       // only allocate space for pointers now,
-      // objects will be lazily allocated/initialized once requested
+      // objects will be lazily allocated/initialized if/when requested
+      // note that __kmp_allocate zeroes the allocated memory
       arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
     }
   }
@@ -2111,6 +2172,59 @@ void *__kmpc_task_reduction_init(int gti
 
 /*!
 @ingroup TASKING
+@param gtid      Global thread ID
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for the taskgroup.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has single parameter - pointer to object to be initialized. That means
+the reduction either does not use omp_orig object, or the omp_orig is accessible
+without help of the runtime library.
+*/
+void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
+}
+
+/*!
+@ingroup TASKING
+@param gtid      Global thread ID
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for the taskgroup.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has two parameters, pointer to object to be initialized and pointer to omp_orig
+*/
+void *__kmpc_taskred_init(int gtid, int num, void *data) {
+  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
+}
+
+// Copy task reduction data (except for shared pointers).
+template <typename T>
+void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
+                                    kmp_taskgroup_t *tg, void *reduce_data) {
+  kmp_taskred_data_t *arr;
+  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
+                " from data %p\n",
+                thr, tg, reduce_data));
+  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
+      thr, num * sizeof(kmp_taskred_data_t));
+  // threads will share private copies, thunk routines, sizes, flags, etc.:
+  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
+  for (int i = 0; i < num; ++i) {
+    arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
+  }
+  tg->reduce_data = (void *)arr;
+  tg->reduce_num_data = num;
+}
+
+/*!
+@ingroup TASKING
 @param gtid    Global thread ID
 @param tskgrp  The taskgroup ID (optional)
 @param data    Shared location of the item
@@ -2128,7 +2242,7 @@ void *__kmpc_task_reduction_get_th_data(
   if (tg == NULL)
     tg = thread->th.th_current_task->td_taskgroup;
   KMP_ASSERT(tg != NULL);
-  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data);
+  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
   kmp_int32 num = tg->reduce_num_data;
   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
 
@@ -2152,17 +2266,21 @@ void *__kmpc_task_reduction_get_th_data(
       found:
         if (p_priv[tid] == NULL) {
           // allocate thread specific object lazily
-          void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init);
           p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
-          if (f_init != NULL) {
-            f_init(p_priv[tid]);
+          if (arr[i].reduce_init != NULL) {
+            if (arr[i].reduce_orig != NULL) { // new interface
+              ((void (*)(void *, void *))arr[i].reduce_init)(
+                  p_priv[tid], arr[i].reduce_orig);
+            } else { // old interface (single parameter)
+              ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
+            }
           }
         }
         return p_priv[tid];
       }
     }
     tg = tg->parent;
-    arr = (kmp_task_red_data_t *)(tg->reduce_data);
+    arr = (kmp_taskred_data_t *)(tg->reduce_data);
     num = tg->reduce_num_data;
   }
   KMP_ASSERT2(0, "Unknown task reduction item");
@@ -2174,7 +2292,7 @@ void *__kmpc_task_reduction_get_th_data(
 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
   kmp_int32 nth = th->th.th_team_nproc;
   KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
-  kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data;
+  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
   kmp_int32 num = tg->reduce_num_data;
   for (int i = 0; i < num; ++i) {
     void *sh_data = arr[i].reduce_shar;
@@ -2207,6 +2325,111 @@ static void __kmp_task_reduction_fini(km
   tg->reduce_data = NULL;
   tg->reduce_num_data = 0;
 }
+
+// Cleanup task reduction data for parallel or worksharing,
+// do not touch task private data other threads still working with.
+// Called from __kmpc_end_taskgroup()
+static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
+  __kmp_thread_free(th, tg->reduce_data);
+  tg->reduce_data = NULL;
+  tg->reduce_num_data = 0;
+}
+
+template <typename T>
+void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
+                                         int num, T *data) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_int32 nth = thr->th.th_team_nproc;
+  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
+  if (nth == 1) {
+    KA_TRACE(10,
+             ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
+              gtid, thr->th.th_current_task->td_taskgroup));
+    return (void *)thr->th.th_current_task->td_taskgroup;
+  }
+  kmp_team_t *team = thr->th.th_team;
+  void *reduce_data;
+  kmp_taskgroup_t *tg;
+  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
+  if (reduce_data == NULL &&
+      __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
+                                 (void *)1)) {
+    // single thread enters this block to initialize common reduction data
+    KMP_DEBUG_ASSERT(reduce_data == NULL);
+    // first initialize own data, then make a copy other threads can use
+    tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
+    reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
+    KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
+    // fini counters should be 0 at this point
+    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
+    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
+    KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
+  } else {
+    while (
+        (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
+        (void *)1) { // wait for task reduction initialization
+      KMP_CPU_PAUSE();
+    }
+    KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
+    tg = thr->th.th_current_task->td_taskgroup;
+    __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
+  }
+  return tg;
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location info
+@param gtid      Global thread ID
+@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for a parallel or worksharing.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has single parameter - pointer to object to be initialized. That means
+the reduction either does not use omp_orig object, or the omp_orig is accessible
+without help of the runtime library.
+*/
+void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
+                                          int num, void *data) {
+  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
+                                            (kmp_task_red_input_t *)data);
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location info
+@param gtid      Global thread ID
+@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for a parallel or worksharing.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has two parameters, pointer to object to be initialized and pointer to omp_orig
+*/
+void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
+                                   void *data) {
+  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
+                                            (kmp_taskred_input_t *)data);
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location info
+@param gtid      Global thread ID
+@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
+
+Finalize task reduction for a parallel or worksharing.
+*/
+void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
+  __kmpc_end_taskgroup(loc, gtid);
+}
 #endif
 
 #if OMP_40_ENABLED
@@ -2326,8 +2549,54 @@ void __kmpc_end_taskgroup(ident_t *loc,
   KMP_DEBUG_ASSERT(taskgroup->count == 0);
 
 #if OMP_50_ENABLED
-  if (taskgroup->reduce_data != NULL) // need to reduce?
-    __kmp_task_reduction_fini(thread, taskgroup);
+  if (taskgroup->reduce_data != NULL) { // need to reduce?
+    int cnt;
+    void *reduce_data;
+    kmp_team_t *t = thread->th.th_team;
+    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
+    // check if <priv> data of the first reduction variable shared for the team
+    void *priv0 = arr[0].reduce_priv;
+    if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
+        ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
+      // finishing task reduction on parallel
+      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
+      if (cnt == thread->th.th_team_nproc - 1) {
+        // we are the last thread passing __kmpc_reduction_modifier_fini()
+        // finalize task reduction:
+        __kmp_task_reduction_fini(thread, taskgroup);
+        // cleanup fields in the team structure:
+        // TODO: is relaxed store enough here (whole barrier should follow)?
+        __kmp_thread_free(thread, reduce_data);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
+      } else {
+        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
+        // so do not finalize reduction, just clean own copy of the data
+        __kmp_task_reduction_clean(thread, taskgroup);
+      }
+    } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
+                   NULL &&
+               ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
+      // finishing task reduction on worksharing
+      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
+      if (cnt == thread->th.th_team_nproc - 1) {
+        // we are the last thread passing __kmpc_reduction_modifier_fini()
+        __kmp_task_reduction_fini(thread, taskgroup);
+        // cleanup fields in team structure:
+        // TODO: is relaxed store enough here (whole barrier should follow)?
+        __kmp_thread_free(thread, reduce_data);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
+      } else {
+        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
+        // so do not finalize reduction, just clean own copy of the data
+        __kmp_task_reduction_clean(thread, taskgroup);
+      }
+    } else {
+      // finishing task reduction on taskgroup
+      __kmp_task_reduction_fini(thread, taskgroup);
+    }
+  }
 #endif
   // Restore parent taskgroup for the current task
   taskdata->td_taskgroup = taskgroup->parent;

Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp Wed May  1 10:54:01 2019
@@ -0,0 +1,99 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+parallel construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+// extern void* __kmpc_task_reduction_modifier_init(void *loc, int gtid, int
+// is_ws, int num, void* data);
+extern void *__kmpc_taskred_modifier_init(void *loc, int gtid, int is_ws,
+                                          int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  void *reduce_orig; /**< original reduction item used for initialization */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+  int var = INIT;
+  int *p_var_orig = &var;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+//  #pragma omp parallel reduction(task,+:var)
+#pragma omp parallel reduction(+ : var) shared(p_var_orig)
+  {
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var;
+    r_var.reduce_shar = &var;
+    r_var.reduce_orig =
+        p_var_orig; // not used in this test but illustrates codegen
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_taskred_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        0, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    {
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      {
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 0);
+  }
+  if (var == INIT + NT * 3 - 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}

Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp Wed May  1 10:54:01 2019
@@ -0,0 +1,93 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+parallel construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+extern void *__kmpc_task_reduction_modifier_init(void *loc, int gtid, int is_ws,
+                                                 int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+  int var = INIT;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+//  #pragma omp parallel reduction(task,+:var)
+#pragma omp parallel reduction(+ : var)
+  {
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var;
+    r_var.reduce_shar = &var;
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_task_reduction_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        0, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    {
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      {
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 0);
+  }
+  if (var == INIT + NT * 3 - 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}

Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp Wed May  1 10:54:01 2019
@@ -0,0 +1,114 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+a worksharing (omp for) construct.
+
+Note: tasks could just use the in_reduction clause, but the compiler does not
+accept this because of a bug: it mistakenly requires the reduction item to be
+shared, which only holds for worksharing reductions, not for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+// extern void* __kmpc_task_reduction_modifier_init(void *loc, int gtid, int
+// flags, int num, void* data);
+extern void *__kmpc_taskred_modifier_init(void *loc, int gtid, int is_ws,
+                                          int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  void *reduce_orig; /**< original reduction item used for initialization */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+  int var = INIT;
+  int *p_var_orig = &var;
+  int i;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+#pragma omp parallel private(i) shared(p_var_orig)
+//  #pragma omp for reduction(task,+:var)
+#pragma omp for reduction(+ : var)
+  for (i = 0; i < NT; ++i) // single iteration per thread
+  {
+    // generated code, which actually should be placed before the distribution
+    // of loop iterations, but is placed here just to show the idea;
+    // to keep correctness the loop count is equal to the number of threads
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var;
+    r_var.reduce_shar = &var;
+    r_var.reduce_orig =
+        p_var_orig; // not used in this test but illustrates codegen
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_taskred_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        1, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    // end of generated code
+    var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    {
+      // emulate task reduction here because of compiler bug:
+      // it mistakenly declines to accept in_reduction because var is private
+      // outside.
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      {
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    // generated code, which actually should be placed after loop completion
+    // but before the barrier and before the loop reduction. It is placed here
+    // just to show the idea, and to keep correctness the loop count is equal
+    // to the number of threads
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 1);
+    // end of generated code
+  }
+  if (var == INIT + NT * 3 - 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}

Added: openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp?rev=359710&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp (added)
+++ openmp/trunk/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp Wed May  1 10:54:01 2019
@@ -0,0 +1,108 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+a worksharing (omp for) construct.
+
+Note: tasks could just use the in_reduction clause, but the compiler does not
+accept this because of a bug: it mistakenly requires the reduction item to be
+shared, which only holds for worksharing reductions, not for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+extern void *__kmpc_task_reduction_modifier_init(void *loc, int gtid, int is_ws,
+                                                 int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
+
+int main() {
+  int var = INIT;
+  int i;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+#pragma omp parallel private(i)
+//  #pragma omp for reduction(task,+:var)
+#pragma omp for reduction(+ : var)
+  for (i = 0; i < NT; ++i) // single iteration per thread
+  {
+    // generated code, which actually should be placed before the distribution
+    // of loop iterations, but is placed here just to show the idea;
+    // to keep correctness the loop count is equal to the number of threads
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var;
+    r_var.reduce_shar = &var;
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_task_reduction_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        1, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    // end of generated code
+    var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    {
+      // emulate task reduction here because of compiler bug:
+      // it mistakenly declines to accept in_reduction because var is private
+      // outside.
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      {
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    // generated code, which actually should be placed after loop completion
+    // but before barrier and before loop reduction. It placed here just to show
+    // the idea,
+    // and to keep correctness the loop count is equal to number of threads
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 1);
+    // end of generated code
+  }
+  if (var == INIT + NT * 3 - 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}




More information about the Openmp-commits mailing list