[Openmp-commits] [openmp] r308338 - Add recursive task scheduling strategy to taskloop implementation

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Tue Jul 18 11:50:13 PDT 2017


Author: jlpeyton
Date: Tue Jul 18 11:50:13 2017
New Revision: 308338

URL: http://llvm.org/viewvc/llvm-project?rev=308338&view=rev
Log:
Add recursive task scheduling strategy to taskloop implementation

Summary:
The taskloop implementation is extended with a recursive task scheduling
strategy. The environment variable KMP_TASKLOOP_MIN_TASKS is added as a manual
threshold that lets the user switch from recursive to linear task scheduling.

Details:
* The calculations of the loop parameters are moved from __kmp_taskloop_linear
  to the upper level.
* The initial calculation is done in __kmpc_taskloop; further range splitting
  is done in __kmp_taskloop_recur.
* A threshold is added to switch from recursive to linear task scheduling
  (the usage example below shows the corresponding environment variable).
* One half of the split range is scheduled as an internal task that just hands
  the sub-range parameters to the stealing thread, which continues recursive
  scheduling (as long as enough tasks remain); the other half is processed
  recursively (see the splitting sketch after this list).
* The internal task duplication routine is fixed to assign the parent task;
  this was not needed while all tasks were scheduled by the same thread, but it
  is needed now.
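
For illustration, a simplified standalone sketch of the splitting arithmetic
used in __kmp_taskloop_recur (not part of the patch; the variable names mirror
the patch, and the asserts restate the same invariant the runtime checks with
KMP_DEBUG_ASSERT):

#include <assert.h>
#include <stdint.h>

/* Split num_tasks tasks covering tc iterations (grainsize iterations each,
   plus `extras` tasks that get one extra iteration) into two halves, each
   preserving the invariant tc == num_tasks * grainsize + extras. */
static void split_taskloop_range(uint64_t num_tasks, uint64_t grainsize,
                                 uint64_t extras, uint64_t tc) {
  uint64_t n_tsk0 = num_tasks >> 1;     // half executed by the current thread
  uint64_t n_tsk1 = num_tasks - n_tsk0; // half scheduled as an auxiliary task
  uint64_t gr_size0 = grainsize, ext0, ext1, tc0, tc1;
  if (n_tsk0 <= extras) {
    gr_size0++;             // fold the extra iterations into the 1st half
    ext0 = 0;               // no extra iterations left in the 1st half
    ext1 = extras - n_tsk0; // remaining extras go to the 2nd half
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else {                  // n_tsk0 > extras
    ext1 = 0;               // no extra iterations in the 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  assert(tc0 == n_tsk0 * gr_size0 + ext0);  // invariant holds for the 1st half
  assert(tc1 == n_tsk1 * grainsize + ext1); // and for the 2nd half
}

int main(void) {
  // 7 tasks of 3 iterations plus 2 extra iterations cover 23 iterations
  split_taskloop_range(7, 3, 2, 23);
  return 0;
}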

Patch by Andrey Churbanov

Differential Revision: https://reviews.llvm.org/D35273
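
A minimal usage sketch of the new environment variable, assuming the same
pattern as the added tests below (the taskloop pragma is standard OpenMP 4.5;
the file name, compiler command, and grainsize value are arbitrary):

// taskloop_demo.c -- build against a libomp containing this patch, e.g.
//   clang -fopenmp taskloop_demo.c -o taskloop_demo
//   KMP_TASKLOOP_MIN_TASKS=1 ./taskloop_demo   # recurse down to single-task leaves
#include <omp.h>
#include <stdio.h>

#define N 1120

int main(void) {
  int tids[N];
  int i;
  for (i = 0; i < N; ++i)
    tids[i] = -1;

  #pragma omp parallel shared(tids)
  {
    #pragma omp master
    #pragma omp taskloop grainsize(40)
    for (i = 0; i < N; i++)
      tids[i] = omp_get_thread_num(); // record which thread ran each iteration
  }

  for (i = 0; i < N; ++i) {
    if (tids[i] == -1) {
      fprintf(stderr, "iteration %d was not executed\n", i);
      return 1;
    }
  }
  printf("all %d iterations executed\n", N);
  return 0;
}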

Added:
    openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c
    openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c
Modified:
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_global.cpp
    openmp/trunk/runtime/src/kmp_settings.cpp
    openmp/trunk/runtime/src/kmp_tasking.cpp
    openmp/trunk/runtime/test/tasking/kmp_taskloop.c

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=308338&r1=308337&r2=308338&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Tue Jul 18 11:50:13 2017
@@ -2050,8 +2050,10 @@ extern kmp_int32 __kmp_default_device; /
 // specified, defaults to 0 otherwise
 #endif
 #if OMP_45_ENABLED
-extern kmp_int32 __kmp_max_task_priority; // Set via OMP_MAX_TASK_PRIORITY if
-// specified, defaults to 0 otherwise
+// Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise
+extern kmp_int32 __kmp_max_task_priority;
+// Set via KMP_TASKLOOP_MIN_TASKS if specified, defaults to 0 otherwise
+extern kmp_uint64 __kmp_taskloop_min_tasks;
 #endif
 
 /* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with

Modified: openmp/trunk/runtime/src/kmp_global.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_global.cpp?rev=308338&r1=308337&r2=308338&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_global.cpp (original)
+++ openmp/trunk/runtime/src/kmp_global.cpp Tue Jul 18 11:50:13 2017
@@ -301,6 +301,7 @@ kmp_int32 __kmp_default_device = 0;
 kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;
 #if OMP_45_ENABLED
 kmp_int32 __kmp_max_task_priority = 0;
+kmp_uint64 __kmp_taskloop_min_tasks = 0;
 #endif
 
 /* This check ensures that the compiler is passing the correct data type for the

Modified: openmp/trunk/runtime/src/kmp_settings.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_settings.cpp?rev=308338&r1=308337&r2=308338&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_settings.cpp (original)
+++ openmp/trunk/runtime/src/kmp_settings.cpp Tue Jul 18 11:50:13 2017
@@ -1167,6 +1167,20 @@ static void __kmp_stg_print_max_task_pri
                                               char const *name, void *data) {
   __kmp_stg_print_int(buffer, name, __kmp_max_task_priority);
 } // __kmp_stg_print_max_task_priority
+
+// KMP_TASKLOOP_MIN_TASKS
+// taskloop threshold to switch from recursive to linear task creation
+static void __kmp_stg_parse_taskloop_min_tasks(char const *name,
+                                              char const *value, void *data) {
+  int tmp;
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &tmp);
+  __kmp_taskloop_min_tasks = tmp;
+} // __kmp_stg_parse_taskloop_min_tasks
+
+static void __kmp_stg_print_taskloop_min_tasks(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_taskloop_min_tasks);
+} // __kmp_stg_print_taskloop_min_tasks
 #endif // OMP_45_ENABLED
 
 // -----------------------------------------------------------------------------
@@ -4370,6 +4384,8 @@ static kmp_setting_t __kmp_stg_table[] =
 #if OMP_45_ENABLED
     {"OMP_MAX_TASK_PRIORITY", __kmp_stg_parse_max_task_priority,
      __kmp_stg_print_max_task_priority, NULL, 0, 0},
+    {"KMP_TASKLOOP_MIN_TASKS", __kmp_stg_parse_taskloop_min_tasks,
+     __kmp_stg_print_taskloop_min_tasks, NULL, 0, 0},
 #endif
     {"OMP_THREAD_LIMIT", __kmp_stg_parse_all_threads,
      __kmp_stg_print_all_threads, NULL, 0, 0},

Modified: openmp/trunk/runtime/src/kmp_tasking.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_tasking.cpp?rev=308338&r1=308337&r2=308338&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_tasking.cpp (original)
+++ openmp/trunk/runtime/src/kmp_tasking.cpp Tue Jul 18 11:50:13 2017
@@ -3237,6 +3237,7 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_inf
                      0);
   }
   taskdata->td_alloc_thread = thread;
+  taskdata->td_parent = parent_task;
   taskdata->td_taskgroup =
       parent_task
           ->td_taskgroup; // task inherits the taskgroup from the parent task
@@ -3263,32 +3264,37 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_inf
   return task;
 }
 
-// Routine optionally generated by th ecompiler for setting the lastprivate flag
+// Routine optionally generated by the compiler for setting the lastprivate flag
 // and calling needed constructors for private/firstprivate objects
 // (used to form taskloop tasks from pattern task)
+// Parameters: dest task, src task, lastprivate flag.
 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
 
 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
 //
 // loc       Source location information
 // gtid      Global thread ID
-// task      Task with whole loop iteration range
-// lb        Pointer to loop lower bound
-// ub        Pointer to loop upper bound
+// task      Pattern task, exposes the loop iteration range
+// lb        Pointer to loop lower bound in task structure
+// ub        Pointer to loop upper bound in task structure
 // st        Loop stride
-// sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
-// grainsize Schedule value if specified
+// ub_glob   Global upper bound (used for lastprivate check)
+// num_tasks Number of tasks to execute
+// grainsize Number of loop iterations per task
+// extras    Number of chunks with grainsize+1 iterations
+// tc        Iterations count
 // task_dup  Tasks duplication routine
 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                           int sched, kmp_uint64 grainsize, void *task_dup) {
+                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                           kmp_uint64 grainsize, kmp_uint64 extras,
+                           kmp_uint64 tc, void *task_dup) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
-  kmp_uint64 tc;
   kmp_uint64 lower = *lb; // compiler provides global bounds here
   kmp_uint64 upper = *ub;
-  kmp_uint64 i, num_tasks = 0, extras = 0;
+  kmp_uint64 i;
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *current_task = thread->th.th_current_task;
   kmp_task_t *next_task;
@@ -3298,6 +3304,254 @@ void __kmp_taskloop_linear(ident_t *loc,
   size_t upper_offset =
       (char *)ub - (char *)task; // remember offset of ub in the task structure
 
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
+                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", gtid, num_tasks,
+                grainsize, extras, lower, upper, ub_glob, st, task_dup));
+
+  // Launch num_tasks tasks, assign grainsize iterations each task
+  for (i = 0; i < num_tasks; ++i) {
+    kmp_uint64 chunk_minus_1;
+    if (extras == 0) {
+      chunk_minus_1 = grainsize - 1;
+    } else {
+      chunk_minus_1 = grainsize;
+      --extras; // first extras iterations get bigger chunk (grainsize+1)
+    }
+    upper = lower + st * chunk_minus_1;
+    if (i == num_tasks - 1) {
+      // schedule the last task, set lastprivate flag if needed
+      if (st == 1) { // most common case
+        KMP_DEBUG_ASSERT(upper == *ub);
+        if (upper == ub_glob)
+          lastpriv = 1;
+      } else if (st > 0) { // positive loop stride
+        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
+        if ((kmp_uint64)st > ub_glob - upper)
+          lastpriv = 1;
+      } else {  // negative loop stride
+        KMP_DEBUG_ASSERT(upper + st < *ub);
+        if (upper - ub_glob < (kmp_uint64)(-st))
+          lastpriv = 1;
+      }
+    }
+    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+    // adjust task-specific bounds
+    *(kmp_uint64 *)((char *)next_task + lower_offset) = lower;
+    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
+    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
+      ptask_dup(next_task, task, lastpriv);
+    KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, "
+                  "upper %lld (offsets %p %p)\n",
+                  gtid, next_task, lower, upper, lower_offset, upper_offset));
+    __kmp_omp_task(gtid, next_task, true); // schedule new task
+    lower = upper + st; // adjust lower bound for the next iteration
+  }
+  // free the pattern task and exit
+  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
+  // do not execute the pattern task, just do internal bookkeeping
+  __kmp_task_finish(gtid, task, current_task);
+}
+
+// Structure to keep taskloop parameters for auxiliary task
+// kept in the shareds of the task structure.
+typedef struct __taskloop_params {
+  kmp_task_t *task;
+  kmp_uint64 *lb;
+  kmp_uint64 *ub;
+  void *task_dup;
+  kmp_int64 st;
+  kmp_uint64 ub_glob;
+  kmp_uint64 num_tasks;
+  kmp_uint64 grainsize;
+  kmp_uint64 extras;
+  kmp_uint64 tc;
+  kmp_uint64 num_t_min;
+} __taskloop_params_t;
+
+void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
+                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
+                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
+                          void *);
+
+// Execute part of the taskloop submitted as a task.
+int __kmp_taskloop_task(int gtid, void *ptask) {
+  __taskloop_params_t *p = (__taskloop_params_t*)((kmp_task_t*)ptask)->shareds;
+  kmp_task_t *task = p->task;
+  kmp_uint64 *lb = p->lb;
+  kmp_uint64 *ub = p->ub;
+  void *task_dup = p->task_dup;
+//  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_int64 st = p->st;
+  kmp_uint64 ub_glob = p->ub_glob;
+  kmp_uint64 num_tasks = p->num_tasks;
+  kmp_uint64 grainsize = p->grainsize;
+  kmp_uint64 extras = p->extras;
+  kmp_uint64 tc = p->tc;
+  kmp_uint64 num_t_min = p->num_t_min;
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  KMP_DEBUG_ASSERT(num_tasks*2+1 > num_t_min);
+  if (num_tasks > num_t_min)
+    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
+  return 0;
+}
+
+// Schedule part of the taskloop as a task,
+// execute the rest of the taskloop.
+//
+// loc       Source location information
+// gtid      Global thread ID
+// task      Pattern task, exposes the loop iteration range
+// lb        Pointer to loop lower bound in task structure
+// ub        Pointer to loop upper bound in task structure
+// st        Loop stride
+// ub_glob   Global upper bound (used for lastprivate check)
+// num_tasks Number of tasks to execute
+// grainsize Number of loop iterations per task
+// extras    Number of chunks with grainsize+1 iterations
+// tc        Iterations count
+// num_t_min Threshold to launch tasks recursively
+// task_dup  Tasks duplication routine
+void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
+                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                          kmp_uint64 grainsize, kmp_uint64 extras,
+                          kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
+  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_uint64 lower = *lb;
+  kmp_uint64 upper = *ub;
+  kmp_info_t *thread = __kmp_threads[gtid];
+//  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  kmp_task_t *next_task;
+  kmp_int32 lastpriv = 0;
+  size_t lower_offset =
+      (char *)lb - (char *)task; // remember offset of lb in the task structure
+  size_t upper_offset =
+      (char *)ub - (char *)task; // remember offset of ub in the task structure
+
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+
+  // split the loop in two halves
+  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
+  kmp_uint64 gr_size0 = grainsize;
+  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
+  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
+  if (n_tsk0 <= extras) {
+    gr_size0++; // integrate extras into grainsize
+    ext0 = 0; // no extra iters in 1st half
+    ext1 = extras - n_tsk0; // remaining extras
+    tc0 = gr_size0 * n_tsk0;
+    tc1 = tc - tc0;
+  } else { // n_tsk0 > extras
+    ext1 = 0; // no extra iters in 2nd half
+    ext0 = extras;
+    tc1 = grainsize * n_tsk1;
+    tc0 = tc - tc1;
+  }
+  ub0 = lower + st * (tc0 - 1);
+  lb1 = ub0 + st;
+
+  // create pattern task for 2nd half of the loop
+  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+  // adjust lower bound (upper bound is not changed) for the 2nd half
+  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
+  if (ptask_dup != NULL) // construct firstprivates, etc.
+    ptask_dup(next_task, task, 0);
+  *ub = ub0; // adjust upper bound for the 1st half
+
+  // create auxiliary task for 2nd half of the loop
+  kmp_task_t *new_task =
+      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void*),
+                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  __taskloop_params_t * p = (__taskloop_params_t *)new_task->shareds;
+  p->task = next_task;
+  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
+  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
+  p->task_dup = task_dup;
+  p->st = st;
+  p->ub_glob = ub_glob;
+  p->num_tasks = n_tsk1;
+  p->grainsize = grainsize;
+  p->extras = ext1;
+  p->tc = tc1;
+  p->num_t_min = num_t_min;
+  __kmp_omp_task(gtid, new_task, true); // schedule new task
+
+  // execute the 1st half of current subrange
+  if (n_tsk0 > num_t_min)
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                         gr_size0, ext0, tc0, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                          gr_size0, ext0, tc0, task_dup);
+
+  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
+}
+
+/*!
+ at ingroup TASKING
+ at param loc       Source location information
+ at param gtid      Global thread ID
+ at param task      Task structure
+ at param if_val    Value of the if clause
+ at param lb        Pointer to loop lower bound in task structure
+ at param ub        Pointer to loop upper bound in task structure
+ at param st        Loop stride
+ at param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+ at param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+ at param grainsize Schedule value if specified
+ at param task_dup  Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
+                     int sched, kmp_uint64 grainsize, void *task_dup) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+
+  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
+                "grain %llu(%d), dup %p\n", gtid, taskdata, *lb, *ub, st,
+                grainsize, sched, task_dup));
+
+  if (nogroup == 0)
+    __kmpc_taskgroup(loc, gtid);
+
+  // =========================================================================
+  // calculate loop parameters
+  kmp_uint64 tc;
+  kmp_uint64 lower = *lb; // compiler provides global bounds here
+  kmp_uint64 upper = *ub;
+  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
+  kmp_uint64 num_tasks = 0, extras = 0;
+  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+
   // compute trip count
   if (st == 1) { // most common case
     tc = upper - lower + 1;
@@ -3314,6 +3568,10 @@ void __kmp_taskloop_linear(ident_t *loc,
     __kmp_task_finish(gtid, task, current_task);
     return;
   }
+  if (num_tasks_min == 0)
+    // TODO: can we choose better default heuristic?
+    num_tasks_min = KMP_MIN(thread->th.th_team_nproc * 10,
+                            INITIAL_TASK_DEQUE_SIZE);
 
   // compute num_tasks/grainsize based on the input provided
   switch (sched) {
@@ -3338,9 +3596,8 @@ void __kmp_taskloop_linear(ident_t *loc,
       extras = 0;
     } else {
       num_tasks = tc / grainsize;
-      grainsize =
-          tc /
-          num_tasks; // adjust grainsize for balanced distribution of iterations
+      // adjust grainsize for balanced distribution of iterations
+      grainsize = tc / num_tasks;
       extras = tc % num_tasks;
     }
     break;
@@ -3350,95 +3607,32 @@ void __kmp_taskloop_linear(ident_t *loc,
   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
   KMP_DEBUG_ASSERT(num_tasks > extras);
   KMP_DEBUG_ASSERT(num_tasks > 0);
-  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
-                "%lld, extras %lld\n",
-                gtid, num_tasks, grainsize, extras));
-
-  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
-  for (i = 0; i < num_tasks; ++i) {
-    kmp_uint64 chunk_minus_1;
-    if (extras == 0) {
-      chunk_minus_1 = grainsize - 1;
-    } else {
-      chunk_minus_1 = grainsize;
-      --extras; // first extras iterations get bigger chunk (grainsize+1)
-    }
-    upper = lower + st * chunk_minus_1;
-    if (i == num_tasks - 1) {
-      // schedule the last task, set lastprivate flag
-      lastpriv = 1;
-#if KMP_DEBUG
-      if (st == 1)
-        KMP_DEBUG_ASSERT(upper == *ub);
-      else if (st > 0)
-        KMP_DEBUG_ASSERT(upper + st > *ub);
-      else
-        KMP_DEBUG_ASSERT(upper + st < *ub);
-#endif
-    }
-    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
-    *(kmp_uint64 *)((char *)next_task + lower_offset) =
-        lower; // adjust task-specific bounds
-    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
-    if (ptask_dup != NULL)
-      ptask_dup(next_task, task,
-                lastpriv); // set lastprivate flag, construct fistprivates, etc.
-    KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
-                  "%lld (offsets %p %p)\n",
-                  gtid, next_task, lower, upper, lower_offset, upper_offset));
-    __kmp_omp_task(gtid, next_task, true); // schedule new task
-    lower = upper + st; // adjust lower bound for the next iteration
-  }
-  // free the pattern task and exit
-  __kmp_task_start(gtid, task, current_task);
-  // do not execute the pattern task, just do bookkeeping
-  __kmp_task_finish(gtid, task, current_task);
-}
-
-/*!
- at ingroup TASKING
- at param loc       Source location information
- at param gtid      Global thread ID
- at param task      Task structure
- at param if_val    Value of the if clause
- at param lb        Pointer to loop lower bound
- at param ub        Pointer to loop upper bound
- at param st        Loop stride
- at param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
- at param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
- at param grainsize Schedule value if specified
- at param task_dup  Tasks duplication routine
-
-Execute the taskloop construct.
-*/
-void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
-                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
-                     int sched, kmp_uint64 grainsize, void *task_dup) {
-  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  KMP_DEBUG_ASSERT(task != NULL);
-
-  KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub "
-                "%lld st %lld, grain %llu(%d)\n",
-                gtid, taskdata, *lb, *ub, st, grainsize, sched));
+  // =========================================================================
 
   // check if clause value first
   if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
-  }
-  if (nogroup == 0) {
-    __kmpc_taskgroup(loc, gtid);
-  }
-
-  if (1 /* AC: use some heuristic here to choose task scheduling method */) {
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize,
-                          task_dup);
+    // always start serial tasks linearly
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+  } else if (num_tasks > num_tasks_min) {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_tasks_min, task_dup);
+  } else {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
   }
 
-  if (nogroup == 0) {
+  if (nogroup == 0)
     __kmpc_end_taskgroup(loc, gtid);
-  }
-  KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
 
 #endif

Modified: openmp/trunk/runtime/test/tasking/kmp_taskloop.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/kmp_taskloop.c?rev=308338&r1=308337&r2=308338&view=diff
==============================================================================
--- openmp/trunk/runtime/test/tasking/kmp_taskloop.c (original)
+++ openmp/trunk/runtime/test/tasking/kmp_taskloop.c Tue Jul 18 11:50:13 2017
@@ -1,4 +1,5 @@
 // RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
 #include <stdio.h>
 #include <omp.h>
 #include "omp_my_sleep.h"

Added: openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c?rev=308338&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c (added)
+++ openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c Tue Jul 18 11:50:13 2017
@@ -0,0 +1,106 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ *     and check that each dispatch has the requested grainsize
+ * It is possible for two adjacent chunks to be executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_grainsize()
+{
+  int i, grainsize, count, tmp_count, result = 0, num_off;
+  int *tmp, *tids, *tidsArray;
+
+  tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+  tids = tidsArray;
+
+  for (grainsize = 1; grainsize < 48; ++grainsize) {
+    fprintf(stderr, "Grainsize %d\n", grainsize);
+    count = tmp_count = num_off = 0;
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      tids[i] = -1;
+    }
+
+    #pragma omp parallel shared(tids)
+    {
+      #pragma omp master
+      #pragma omp taskloop grainsize(grainsize)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      if (tids[i] == -1) {
+        fprintf(stderr, "  Iteration %d not touched!\n", i);
+        result++;
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    tmp = (int *)malloc(sizeof(int) * (count + 1));
+    tmp[0] = 1;
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tmp_count > count) {
+        printf("--------------------\nTestinternal Error: List too "
+               "small!!!\n--------------------\n");
+        break;
+      }
+      if (tids[i] != tids[i + 1]) {
+        tmp_count++;
+        tmp[tmp_count] = 1;
+      } else {
+        tmp[tmp_count]++;
+      }
+    }
+
+    // is grainsize statement working?
+    int num_tasks = CFDMAX_SIZE / grainsize;
+    int multiple1 = CFDMAX_SIZE / num_tasks;
+    int multiple2 = CFDMAX_SIZE / num_tasks + 1;
+    for (i = 0; i < count; i++) {
+      // it is possible for 2 adjacent chunks to be assigned to the same thread
+      if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) {
+        num_off++;
+      }
+    }
+
+    if (num_off > 1) {
+      fprintf(stderr, "  The number of bad chunks is %d\n", num_off);
+      result++;
+    } else {
+      fprintf(stderr, "  Everything ok\n");
+    }
+
+    free(tmp);
+  }
+  free(tidsArray);
+  return (result==0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_grainsize()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}

Added: openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c?rev=308338&view=auto
==============================================================================
--- openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c (added)
+++ openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c Tue Jul 18 11:50:13 2017
@@ -0,0 +1,67 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ *     and check that no more than the requested num_tasks chunks are created
+ * It is possible for two adjacent chunks to be executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_num_tasks()
+{
+  int i;
+  int *tids;
+  int *tidsArray;
+  int count;
+  int result = 0;
+  int num_tasks;
+
+  for (num_tasks = 1; num_tasks < 120; ++num_tasks) {
+    count = 0;
+    tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+    tids = tidsArray;
+
+    #pragma omp parallel shared(tids)
+    {
+      int i;
+      #pragma omp master
+      #pragma omp taskloop num_tasks(num_tasks)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    if (count > num_tasks) {
+      fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n",
+              num_tasks, count);
+      result++;
+    }
+  }
+
+  return (result==0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_num_tasks()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}



