[Openmp-commits] [openmp] r236753 - D9306 omp 4.1 async offload support (partial): code changes
Andrey Churbanov
Andrey.Churbanov at intel.com
Thu May 7 10:41:52 PDT 2015
Author: achurbanov
Date: Thu May 7 12:41:51 2015
New Revision: 236753
URL: http://llvm.org/viewvc/llvm-project?rev=236753&view=rev
Log:
D9306 omp 4.1 async offload support (partial): code changes
Modified:
openmp/trunk/runtime/src/dllexports
openmp/trunk/runtime/src/kmp.h
openmp/trunk/runtime/src/kmp_barrier.cpp
openmp/trunk/runtime/src/kmp_csupport.c
openmp/trunk/runtime/src/kmp_runtime.c
openmp/trunk/runtime/src/kmp_taskdeps.cpp
openmp/trunk/runtime/src/kmp_tasking.c
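For orientation, a minimal sketch (not part of the patch) of how an offload runtime is expected to drive the two new entry points. Only the two __kmpc_proxy_* declarations come from this change; the types and callback plumbing are hypothetical placeholders:

    /* Sketch only: a device plugin completing a proxy task from outside
       the OpenMP team. */
    typedef int kmp_int32;
    typedef struct kmp_task kmp_task_t; /* opaque to the plugin */

    /* New entry points added by this commit (see dllexports below). */
    extern void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
    extern void __kmpc_proxy_task_completed_ooo( kmp_task_t *ptask );

    /* Hypothetical callback fired by the device runtime once the offloaded
       work behind a proxy task finishes. It runs on a device-runtime thread,
       i.e. outside the team, so it must use the out-of-order ("ooo") variant,
       which queues the bottom-half finish back into a team thread's deque. */
    static void on_device_completion( void *cb_data )
    {
        __kmpc_proxy_task_completed_ooo( (kmp_task_t *)cb_data );
    }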
Modified: openmp/trunk/runtime/src/dllexports
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/dllexports?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/dllexports (original)
+++ openmp/trunk/runtime/src/dllexports Thu May 7 12:41:51 2015
@@ -381,6 +381,14 @@ kmpc_set_defaults
%endif # OMP_40
%endif
+# OpenMP 4.1 entry points
+%ifndef stub
+ %ifdef OMP_41
+ __kmpc_proxy_task_completed 259
+ __kmpc_proxy_task_completed_ooo 260
+ %endif
+%endif
+
# User API entry points that have both lower- and upper- case versions for Fortran.
# Number for lowercase version is indicated. Number for uppercase is obtained by adding 1000.
# User API entry points are entry points that start with 'kmp_' or 'omp_'.
Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Thu May 7 12:41:51 2015
@@ -51,6 +51,8 @@
#define TASK_UNTIED 0
#define TASK_EXPLICIT 1
#define TASK_IMPLICIT 0
+#define TASK_PROXY 1
+#define TASK_FULL 0
#define KMP_CANCEL_THREADS
#define KMP_THREAD_ATTR
@@ -1987,7 +1989,12 @@ typedef struct kmp_tasking_flags {
unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
#if OMP_40_ENABLED
unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to invoke destructors from the runtime */
+#if OMP_41_ENABLED
+ unsigned proxy : 1; /* task is a proxy task (it will be executed outside the context of the RTL) */
+ unsigned reserved : 11; /* reserved for compiler use */
+#else
unsigned reserved : 12; /* reserved for compiler use */
+#endif
#else // OMP_40_ENABLED
unsigned reserved : 13; /* reserved for compiler use */
#endif // OMP_40_ENABLED
@@ -2077,6 +2084,9 @@ typedef struct kmp_base_task_team {
/* TRUE means tt_threads_data is set up and initialized */
kmp_int32 tt_nproc; /* #threads in team */
kmp_int32 tt_max_threads; /* number of entries allocated for threads_data array */
+#if OMP_41_ENABLED
+ kmp_int32 tt_found_proxy_tasks; /* Have we found proxy tasks since last barrier */
+#endif
KMP_ALIGN_CACHE
volatile kmp_uint32 tt_unfinished_threads; /* #threads still active */
@@ -3147,7 +3157,7 @@ int __kmp_execute_tasks_oncore(kmp_info_
extern void __kmp_reap_task_teams( void );
extern void __kmp_unref_task_team( kmp_task_team_t *task_team, kmp_info_t *thread );
extern void __kmp_wait_to_unref_task_teams( void );
-extern void __kmp_task_team_setup ( kmp_info_t *this_thr, kmp_team_t *team, int both );
+extern void __kmp_task_team_setup ( kmp_info_t *this_thr, kmp_team_t *team, int both, int always );
extern void __kmp_task_team_sync ( kmp_info_t *this_thr, kmp_team_t *team );
extern void __kmp_task_team_wait ( kmp_info_t *this_thr, kmp_team_t *team
#if USE_ITT_BUILD
@@ -3302,8 +3312,16 @@ KMP_EXPORT kmp_int32 __kmpc_cancellation
KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t* loc_ref, kmp_int32 gtid);
KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
+#if OMP_41_ENABLED
+
+KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask );
+KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+
+#endif
+
#endif
+
/*
* Lock interface routines (fast versions with gtid passed in)
*/
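A hedged sketch of the producer side: how generated code might request the new proxy flavor at allocation time. The flag layout comes from kmp_tasking_flags above; loc, gtid, the sizes and task_entry are placeholders from the usual __kmpc_omp_task_alloc() calling convention, and __kmp_task_alloc() (kmp_tasking.c below) forces TASK_UNTIED and merged_if0 for proxy tasks itself:

    /* Assumption: __kmpc_omp_task_alloc() takes the flags as a kmp_int32
       and reinterprets them as kmp_tasking_flags_t, so setting the new
       proxy bit is all the front end has to do. */
    kmp_tasking_flags_t input_flags = { 0 };
    input_flags.proxy = TASK_PROXY; /* bit added by this change */
    kmp_task_t *ptask =
        __kmpc_omp_task_alloc( loc, gtid, *(kmp_int32 *)&input_flags,
                               sizeof_kmp_task_t, sizeof_shareds, task_entry );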
Modified: openmp/trunk/runtime/src/kmp_barrier.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_barrier.cpp?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_barrier.cpp (original)
+++ openmp/trunk/runtime/src/kmp_barrier.cpp Thu May 7 12:41:51 2015
@@ -1135,7 +1135,7 @@ __kmp_barrier(enum barrier_type bt, int
if (__kmp_tasking_mode != tskm_immediate_exec) {
__kmp_task_team_wait(this_thr, team
USE_ITT_BUILD_ARG(itt_sync_obj) );
- __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team
+ __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to set up only the current team, and only if nthreads > 1
}
@@ -1227,9 +1227,32 @@ __kmp_barrier(enum barrier_type bt, int
} else { // Team is serialized.
status = 0;
if (__kmp_tasking_mode != tskm_immediate_exec) {
+#if OMP_41_ENABLED
+ if ( this_thr->th.th_task_team != NULL ) {
+ void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+ if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+ itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+ __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+ }
+#endif
+
+ kmp_task_team_t * task_team = this_thr->th.th_task_team;
+ KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
+ __kmp_task_team_wait(this_thr, team
+ USE_ITT_BUILD_ARG(itt_sync_obj));
+ __kmp_task_team_setup(this_thr, team, 0, 0);
+
+#if USE_ITT_BUILD
+ if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+ __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+ }
+#else
// The task team should be NULL for serialized code (tasks will be executed immediately)
KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
+#endif
}
}
KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
@@ -1532,7 +1555,7 @@ __kmp_fork_barrier(int gtid, int tid)
#endif
if (__kmp_tasking_mode != tskm_immediate_exec) {
- __kmp_task_team_setup(this_thr, team, 1); // 1 indicates setup both task teams
+ __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates set up both task teams if nthreads > 1
}
/* The master thread may have changed its blocktime between the join barrier and the
Modified: openmp/trunk/runtime/src/kmp_csupport.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_csupport.c?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_csupport.c (original)
+++ openmp/trunk/runtime/src/kmp_csupport.c Thu May 7 12:41:51 2015
@@ -461,6 +461,14 @@ __kmpc_end_serialized_parallel(ident_t *
this_thr = __kmp_threads[ global_tid ];
serial_team = this_thr->th.th_serial_team;
+ #if OMP_41_ENABLED
+ kmp_task_team_t * task_team = this_thr->th.th_task_team;
+
+ // we need to wait for the proxy tasks before finishing the thread
+ if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
+ __kmp_task_team_wait(this_thr, serial_team, NULL ); // is an ITT object needed here?
+ #endif
+
KMP_MB();
KMP_DEBUG_ASSERT( serial_team );
KMP_ASSERT( serial_team -> t.t_serialized );
Modified: openmp/trunk/runtime/src/kmp_runtime.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_runtime.c?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_runtime.c (original)
+++ openmp/trunk/runtime/src/kmp_runtime.c Thu May 7 12:41:51 2015
@@ -3940,6 +3940,16 @@ __kmp_unregister_root_current_thread( in
KMP_MB();
+#if OMP_41_ENABLED
+ kmp_info_t * thread = __kmp_threads[gtid];
+ kmp_team_t * team = thread->th.th_team;
+ kmp_task_team_t * task_team = thread->th.th_task_team;
+
+ // we need to wait for the proxy tasks before finishing the thread
+ if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
+ __kmp_task_team_wait(thread, team, NULL );
+#endif
+
__kmp_reset_root(gtid, root);
/* free up this thread slot */
Modified: openmp/trunk/runtime/src/kmp_taskdeps.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_taskdeps.cpp?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_taskdeps.cpp (original)
+++ openmp/trunk/runtime/src/kmp_taskdeps.cpp Thu May 7 12:41:51 2015
@@ -403,6 +403,9 @@ __kmpc_omp_task_with_deps( ident_t *loc_
kmp_taskdata_t * current_task = thread->th.th_current_task;
bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;
+#if OMP_41_ENABLED
+ serial = serial && !(new_taskdata->td_flags.proxy == TASK_PROXY);
+#endif
if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) {
/* if no dependencies have been tracked yet, create the dependence hash */
@@ -425,11 +428,20 @@ __kmpc_omp_task_with_deps( ident_t *loc_
new_taskdata ) );
return TASK_CURRENT_NOT_QUEUED;
}
+ } else {
+#if OMP_41_ENABLED
+ kmp_task_team_t * task_team = thread->th.th_task_team;
+ if ( task_team && task_team->tt.tt_found_proxy_tasks )
+ __kmpc_omp_wait_deps ( loc_ref, gtid, ndeps, dep_list, ndeps_noalias, noalias_dep_list );
+ else
+#endif
+ KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies for task (serialized)"
+ "loc=%p task=%p\n", gtid, loc_ref, new_taskdata ) );
}
KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking dependencies : "
"loc=%p task=%p, transferring to __kmpc_omp_task\n", gtid, loc_ref,
- new_taskdata ) );
+ new_taskdata ) );
return __kmpc_omp_task(loc_ref,gtid,new_task);
}
@@ -460,9 +472,15 @@ __kmpc_omp_wait_deps ( ident_t *loc_ref,
kmp_taskdata_t * current_task = thread->th.th_current_task;
// We can return immediately as:
- // - dependences are not computed in serial teams
+ // - dependences are not computed in serial teams (except if we have proxy tasks)
// - if the dephash is not yet created it means we have nothing to wait for
- if ( current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final || current_task->td_dephash == NULL ) {
+ bool ignore = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;
+#if OMP_41_ENABLED
+ ignore = ignore && thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE;
+#endif
+ ignore = ignore || current_task->td_dephash == NULL;
+
+ if ( ignore ) {
KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking dependencies : loc=%p\n", gtid, loc_ref) );
return;
}
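At the user level, these serialized-team changes (together with the barrier changes above) are what make a shape like the following work, assuming the 4.1 target constructs are lowered onto proxy tasks as this patch series intends; the depend clause on the host task can no longer be ignored just because the team is serialized:

    /* Hedged sketch, not a test from this commit. */
    #include <stdio.h>

    int main( void )
    {
        int x = 0;
        #pragma omp parallel if(0) /* serialized team */
        {
            #pragma omp target nowait depend(out: x) map(tofrom: x) /* proxy task */
            { x = 42; }
            #pragma omp task depend(in: x) /* must now really wait on the target */
            { printf( "%d\n", x ); }
            #pragma omp taskwait
        }
        return 0;
    }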
Modified: openmp/trunk/runtime/src/kmp_tasking.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_tasking.c?rev=236753&r1=236752&r2=236753&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_tasking.c (original)
+++ openmp/trunk/runtime/src/kmp_tasking.c Thu May 7 12:41:51 2015
@@ -32,6 +32,10 @@ static void __kmp_enable_tasking( kmp_ta
static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
+#ifdef OMP_41_ENABLED
+static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
+#endif
+
static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
switch (((kmp_flag_64 *)flag)->get_type()) {
case flag32: __kmp_resume_32(gtid, NULL); break;
@@ -312,8 +316,19 @@ __kmp_push_task(kmp_int32 gtid, kmp_task
// Lock the deque for the task push operation
__kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
+#if OMP_41_ENABLED
+ // Need to recheck as we can get a proxy task from a thread outside of OpenMP
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+ {
+ __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
+ KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
+ gtid, taskdata ) );
+ return TASK_NOT_PUSHED;
+ }
+#else
// Must have room since no thread can add tasks but calling thread
KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
+#endif
thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata; // Push taskdata
// Wrap index.
@@ -780,6 +795,10 @@ __kmp_init_implicit_task( ident_t *loc_r
task->td_flags.tiedness = TASK_TIED;
task->td_flags.tasktype = TASK_IMPLICIT;
+#if OMP_41_ENABLED
+ task->td_flags.proxy = TASK_FULL;
+#endif
+
// All implicit tasks are executed immediately, not deferred
task->td_flags.task_serial = 1;
task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
@@ -864,6 +883,40 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_
flags->final = 1;
}
+#if OMP_41_ENABLED
+ if ( flags->proxy == TASK_PROXY ) {
+ flags->tiedness = TASK_UNTIED;
+ flags->merged_if0 = 1;
+
+ /* are we running in a serialized parallel region or in tskm_immediate_exec mode? either way we need tasking support enabled */
+ if ( (thread->th.th_task_team) == NULL ) {
+ /* This should only happen if the team is serialized:
+ set up a task team and propagate it to the thread
+ */
+ KMP_DEBUG_ASSERT(team->t.t_serialized);
+ KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
+ __kmp_task_team_setup(thread,team,0,1); // 0,1 indicates set up only the current team, regardless of nthreads
+ thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
+ }
+ kmp_task_team_t * task_team = thread->th.th_task_team;
+
+ /* tasking must be enabled now as the task might not be pushed */
+ if ( !KMP_TASKING_ENABLED( task_team ) ) {
+ KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
+ __kmp_enable_tasking( task_team, thread );
+ kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+ kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
+ // No lock needed since only owner can allocate
+ if (thread_data -> td.td_deque == NULL ) {
+ __kmp_alloc_task_deque( thread, thread_data );
+ }
+ }
+
+ if ( task_team->tt.tt_found_proxy_tasks == FALSE )
+ TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
+ }
+#endif
+
// Calculate shared structure offset including padding after kmp_task_t struct
// to align pointers in shared struct
shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
@@ -913,7 +966,11 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_
taskdata->td_taskwait_counter = 0;
taskdata->td_taskwait_thread = 0;
KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
- copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
+#if OMP_41_ENABLED
+ // avoid copying icvs for proxy tasks
+ if ( flags->proxy == TASK_FULL )
+#endif
+ copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
taskdata->td_flags.tiedness = flags->tiedness;
taskdata->td_flags.final = flags->final;
@@ -921,6 +978,9 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_
#if OMP_40_ENABLED
taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
+#if OMP_41_ENABLED
+ taskdata->td_flags.proxy = flags->proxy;
+#endif
taskdata->td_flags.tasktype = TASK_EXPLICIT;
// GEH - TODO: fix this to copy parent task's value of tasking_ser flag
@@ -949,8 +1009,14 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_
taskdata->td_dephash = NULL;
taskdata->td_depnode = NULL;
#endif
- // Only need to keep track of child task counts if team parallel and tasking not serialized
- if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
+
+ // Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
+#if OMP_41_ENABLED
+ if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
+#else
+ if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
+#endif
+ {
KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
#if OMP_40_ENABLED
if ( parent_task->td_taskgroup )
@@ -989,9 +1055,14 @@ __kmpc_omp_task_alloc( ident_t *loc_ref,
input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags
- KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
+ KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
+#if OMP_41_ENABLED
+ input_flags->proxy ? "proxy" : "",
+#else
+ "",
+#endif
sizeof_kmp_task_t, sizeof_shareds, task_entry) );
retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
@@ -1019,6 +1090,27 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_t
KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
gtid, taskdata, current_task) );
+#if OMP_41_ENABLED
+ if ( taskdata->td_flags.proxy == TASK_PROXY &&
+ taskdata->td_flags.complete == 1)
+ {
+ // This is a proxy task that was already completed but it needs to run
+ // its bottom-half finish
+ KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
+ gtid, taskdata) );
+
+ __kmp_bottom_half_finish_proxy(gtid,task);
+
+ KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, resuming task %p\n", gtid, taskdata, current_task) );
+
+ return;
+ }
+#endif
+
+#if OMP_41_ENABLED
+ // Proxy tasks are not handled by the runtime
+ if ( taskdata->td_flags.proxy != TASK_PROXY )
+#endif
__kmp_task_start( gtid, task, current_task );
#if OMPT_SUPPORT
@@ -1075,9 +1167,13 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_t
}
#endif
- __kmp_task_finish( gtid, task, current_task );
+#if OMP_41_ENABLED
+ // Proxy tasks are not handled by the runtime
+ if ( taskdata->td_flags.proxy != TASK_PROXY )
+#endif
+ __kmp_task_finish( gtid, task, current_task );
- KA_TRACE(30, ("__kmp_inovke_task(exit): T#%d completed task %p, resuming task %p\n",
+ KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
gtid, taskdata, current_task) );
return;
}
@@ -1140,8 +1236,11 @@ __kmp_omp_task( kmp_int32 gtid, kmp_task
/* Should we execute the new task or queue it? For now, let's just always try to
queue it. If the queue fills up, then we'll execute it. */
-
+#if OMP_41_ENABLED
+ if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
+#else
if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
+#endif
{ // Execute this task immediately
kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
if ( serialize_immediate )
@@ -1216,7 +1315,12 @@ __kmpc_omp_taskwait( ident_t *loc_ref, k
__kmp_itt_taskwait_starting( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
- if ( ! taskdata->td_flags.team_serial ) {
+#if OMP_41_ENABLED
+ if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
+#else
+ if ( ! taskdata->td_flags.team_serial )
+#endif
+ {
// GEH: if team serialized, avoid reading the volatile variable below.
kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
@@ -1338,7 +1442,12 @@ __kmpc_end_taskgroup( ident_t* loc, int
__kmp_itt_taskwait_starting( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
- if ( ! taskdata->td_flags.team_serial ) {
+#if OMP_41_ENABLED
+ if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
+#else
+ if ( ! taskdata->td_flags.team_serial )
+#endif
+ {
kmp_flag_32 flag(&(taskgroup->count), 0U);
while ( TCR_4(taskgroup->count) != 0 ) {
flag.execute_tasks(thread, gtid, FALSE, &thread_finished
@@ -1582,7 +1691,11 @@ static inline int __kmp_execute_tasks_te
nthreads = task_team -> tt.tt_nproc;
unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
+#if OMP_41_ENABLED
+ KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
+#else
KMP_DEBUG_ASSERT( nthreads > 1 );
+#endif
KMP_DEBUG_ASSERT( TCR_4((int)*unfinished_threads) >= 0 );
// Choose tasks from our own work queue.
@@ -1617,7 +1730,13 @@ static inline int __kmp_execute_tasks_te
// This thread's work queue is empty. If we are in the final spin loop
// of the barrier, check and see if the termination condition is satisfied.
- if (final_spin) {
+#if OMP_41_ENABLED
+ // The work queue may be empty but there might be proxy tasks still executing
+ if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
+#else
+ if (final_spin)
+#endif
+ {
// First, decrement the #unfinished threads, if that has not already
// been done. This decrement might be to the spin location, and
// result in the termination condition being satisfied.
@@ -1639,6 +1758,12 @@ static inline int __kmp_execute_tasks_te
}
}
+#if OMP_41_ENABLED
+ // check if there are other threads to steal from, otherwise go back
+ if ( nthreads == 1 )
+ goto start;
+#endif
+
// Try to steal from the last place I stole from successfully.
tid = thread -> th.th_info.ds.ds_tid;//__kmp_tid_from_gtid( gtid );
last_stolen = threads_data[ tid ].td.td_deque_last_stolen;
@@ -1686,7 +1811,13 @@ static inline int __kmp_execute_tasks_te
// The victims's work queue is empty. If we are in the final spin loop
// of the barrier, check and see if the termination condition is satisfied.
- if (final_spin) {
+#if OMP_41_ENABLED
+ // The work queue may be empty but there might be proxy tasks still executing
+ if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
+#else
+ if (final_spin)
+#endif
+ {
// First, decrement the #unfinished threads, if that has not already
// been done. This decrement might be to the spin location, and
// result in the termination condition being satisfied.
@@ -1793,7 +1924,13 @@ static inline int __kmp_execute_tasks_te
// Going on and finding a new victim to steal from is expensive, as it
// involves a lot of cache misses, so we definitely want to re-check the
// termination condition before doing that.
- if (final_spin) {
+#if OMP_41_ENABLED
+ // The work queue may be empty but there might be proxy tasks still executing
+ if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
+#else
+ if (final_spin)
+#endif
+ {
// First, decrement the #unfinished threads, if that has not already
// been done. This decrement might be to the spin location, and
// result in the termination condition being satisfied.
@@ -2073,8 +2210,8 @@ __kmp_realloc_task_threads_data( kmp_inf
__kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
// copy old data to new data
KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
- (void *) old_data,
- maxthreads * sizeof(kmp_taskdata_t *) );
+ (void *) old_data,
+ maxthreads * sizeof(kmp_taskdata_t *) );
#ifdef BUILD_TIED_TASK_STACK
// GEH: Figure out if this is the right thing to do
@@ -2194,6 +2331,9 @@ __kmp_allocate_task_team( kmp_info_t *th
}
TCW_4(task_team -> tt.tt_found_tasks, FALSE);
+#if OMP_41_ENABLED
+ TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
+#endif
task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
@@ -2365,11 +2505,11 @@ __kmp_wait_to_unref_task_teams(void)
// an already created, unused one if it already exists.
// This may be called by any thread, but only for teams with # threads >1.
void
-__kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int both )
+__kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int both, int always )
{
KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
- if ( ( team->t.t_task_team[this_thr->th.th_task_state] == NULL ) && ( team->t.t_nproc > 1 ) ) {
+ if ( ( team->t.t_task_team[this_thr->th.th_task_state] == NULL ) && ( always || team->t.t_nproc > 1 ) ) {
// Allocate a new task team, which will be propagated to
// all of the worker threads after the barrier. As they
// spin in the barrier release phase, then will continue
@@ -2431,7 +2571,7 @@ __kmp_task_team_sync( kmp_info_t *this_t
//------------------------------------------------------------------------------
// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
-// barrier gather phase. Only called by master thread if #threads in team > 1 !
+// barrier gather phase. Only called by master thread if #threads in team > 1 or if proxy tasks were created
void
__kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
USE_ITT_BUILD_ARG(void * itt_sync_obj)
@@ -2457,7 +2597,12 @@ __kmp_task_team_wait( kmp_info_t *this_t
// The master thread is not included in the ref count.
KA_TRACE( 20, ( "__kmp_task_team_wait: Master T#%d deactivating task_team %p\n",
__kmp_gtid_from_thread( this_thr ), task_team ) );
+#if OMP_41_ENABLED
+ KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
+ TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
+#else
KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
+#endif
TCW_SYNC_4( task_team->tt.tt_active, FALSE );
KMP_MB();
@@ -2505,3 +2650,176 @@ __kmp_tasking_barrier( kmp_team_t *team,
#endif /* USE_ITT_BUILD */
}
+
+#if OMP_41_ENABLED
+
+/* __kmp_give_task puts a task into a given thread's queue if:
+ - the queue for that thread has been created
+ - there's space in that queue
+
+ Because of this, __kmp_push_task needs to re-check for space after acquiring the lock
+ */
+static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
+{
+ kmp_task_team_t * task_team = thread->th.th_task_team;
+ kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
+ kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+ bool result = false;
+
+ KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
+
+ // assert tasking is enabled? what if not?
+ KMP_DEBUG_ASSERT( task_team != NULL );
+
+ if (thread_data -> td.td_deque == NULL ) {
+ // There's no queue in this thread, go find another one
+ // We're guaranteed that at least one thread has a queue
+ KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
+ return result;
+ }
+
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+ {
+ KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+ return result;
+ }
+
+ __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+ {
+ KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+ goto release_and_exit;
+ }
+
+ thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
+ // Wrap index.
+ thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+ TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
+
+ result = true;
+ KA_TRACE(30, ("__kmp_give_task: succesfully gave task %p to thread %d.\n", taskdata, tid ) );
+
+release_and_exit:
+ __kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+ return result;
+}
+
+
+/* The finish of a proxy task is divided into two pieces:
+ - the top half is the part that can be done from a thread outside the team
+ - the bottom half must be run from a thread within the team
+
+ In order to run the bottom half the task gets queued back into one of the team's thread deques.
+ Once the td_incomplete_child_tasks counter of the parent is decremented the threads can leave the barrier.
+ So, the bottom half needs to be queued before the counter is decremented. The top half is therefore split into two parts:
+ - things that can be run before queuing the bottom half
+ - things that must be run after queuing the bottom half
+
+ This creates a second race, as the bottom half can free the task before the second top half has executed. To avoid this,
+ we use the td_incomplete_child_tasks counter of the proxy task itself to synchronize the top and bottom halves.
+*/
+
+static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
+{
+ KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
+ KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
+ KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
+ KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
+
+ taskdata -> td_flags.complete = 1; // mark the task as completed
+
+ if ( taskdata->td_taskgroup )
+ KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
+
+ // Create an imaginary child for this task so the bottom half cannot release the task before we have completed the second top half
+ TCR_4(taskdata->td_incomplete_child_tasks++);
+}
+
+static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
+{
+ kmp_int32 children = 0;
+
+ // Predecrement simulated by "- 1" calculation
+ children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
+ KMP_DEBUG_ASSERT( children >= 0 );
+
+ // Remove the imaginary child
+ TCR_4(taskdata->td_incomplete_child_tasks--);
+}
+
+static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
+{
+ kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
+ kmp_info_t * thread = __kmp_threads[ gtid ];
+
+ KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
+ KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
+
+ // We need to wait to make sure the top half is finished
+ // Spinning here should be ok as this should happen quickly
+ while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
+
+ __kmp_release_deps(gtid,taskdata);
+ __kmp_free_task_and_ancestors(gtid, taskdata, thread);
+}
+
+/*!
+@ingroup TASKING
+@param gtid Global Thread ID of encountering thread
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that is part of the team. The top and bottom halves are run directly.
+*/
+void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
+{
+ KMP_DEBUG_ASSERT( ptask != NULL );
+ kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
+ KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
+
+ KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
+
+ __kmp_first_top_half_finish_proxy(taskdata);
+ __kmp_second_top_half_finish_proxy(taskdata);
+ __kmp_bottom_half_finish_proxy(gtid,ptask);
+
+ KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
+}
+
+/*!
+@ingroup TASKING
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that may not belong to the team. The bottom-half finish is queued back to a thread within the team.
+*/
+void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
+{
+ KMP_DEBUG_ASSERT( ptask != NULL );
+ kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
+
+ KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
+
+ KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
+
+ __kmp_first_top_half_finish_proxy(taskdata);
+
+ // Enqueue the task so that a thread within the corresponding team completes the bottom half
+ kmp_team_t * team = taskdata->td_team;
+ kmp_int32 nthreads = team->t.t_nproc;
+ kmp_info_t *thread;
+ kmp_int32 k = 0;
+
+ do {
+ // This should be similar to k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
+ // For now we're just linearly trying to find a thread
+ k = (k+1) % nthreads;
+ thread = team->t.t_threads[k];
+ } while ( !__kmp_give_task( thread, k, ptask ) );
+
+ __kmp_second_top_half_finish_proxy(taskdata);
+
+ KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
+}
+
+#endif
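To build intuition for the comment block above, a standalone toy model (plain C11, no kmp internals; every name is invented for the sketch) of the handshake: the "imaginary child" keeps the bottom half from freeing the task while the second top half is still running.

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <threads.h>

    typedef struct {
        atomic_int incomplete_children; /* stands in for td_incomplete_child_tasks */
    } toy_task;

    static void first_top_half( toy_task *t ) /* device thread, before queuing */
    {
        atomic_fetch_add( &t->incomplete_children, 1 ); /* add imaginary child */
    }

    static void second_top_half( toy_task *t ) /* device thread, after queuing */
    {
        atomic_fetch_sub( &t->incomplete_children, 1 ); /* remove imaginary child */
    }

    static int bottom_half( void *arg ) /* runs on a team thread */
    {
        toy_task *t = (toy_task *)arg;
        while ( atomic_load( &t->incomplete_children ) > 0 )
            ; /* spin until the second top half is done, as in the runtime */
        free( t ); /* only now is it safe to release the task */
        return 0;
    }

    int main( void )
    {
        toy_task *t = malloc( sizeof *t );
        atomic_init( &t->incomplete_children, 0 );
        thrd_t team_thread;
        first_top_half( t ); /* 1: before handing the task over */
        thrd_create( &team_thread, bottom_half, t ); /* 2: "give" the task away */
        second_top_half( t ); /* 3: finish the top half */
        thrd_join( team_thread, NULL ); /* the bottom half frees t */
        return 0;
    }

Dropping steps 1 and 3 reproduces the race the comment describes: the bottom half could free the task while the device thread still needs it.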