[Openmp-commits] [openmp] r271319 - Avoid deadlock with COI
Jonathan Peyton via Openmp-commits
openmp-commits at lists.llvm.org
Tue May 31 12:07:01 PDT 2016
Author: jlpeyton
Date: Tue May 31 14:07:00 2016
New Revision: 271319
URL: http://llvm.org/viewvc/llvm-project?rev=271319&view=rev
Log:
Avoid deadlock with COI
When an asynchronous offload task is completed, COI calls the runtime to queue
a "destructor task". When the task deques are full, a deadlock arises: the
OpenMP threads are inside the runtime but cannot make progress because the COI
thread is stuck inside the runtime trying to find a slot in a deque.
This patch implements the solution where a task deque is doubled in size when
a task is queued from a COI thread and the deque is full.
Differential Revision: http://reviews.llvm.org/D20733
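The core of the change is easiest to see outside the diff. Below is a minimal
standalone sketch of the scheme the patch adds, in C99 with illustrative names
(ring_t, ring_grow, and ring_push are stand-ins, not runtime symbols),
assuming a power-of-two capacity so index wrapping can use a mask:

#include <stdlib.h>

typedef struct {
    void   **buf;     /* td_deque */
    int      size;    /* td_deque_size; always a power of two */
    unsigned head;    /* td_deque_head */
    unsigned tail;    /* td_deque_tail */
    int      ntasks;  /* td_deque_ntasks */
} ring_t;

/* Double the buffer, copying live entries in logical order so that
   head == 0 and tail == old size afterwards. Called only when full. */
static void ring_grow(ring_t *r) {
    int new_size = 2 * r->size;
    void **nb = malloc(new_size * sizeof(void *));
    for (unsigned i = r->head, j = 0; j < (unsigned)r->size;
         i = (i + 1) & (r->size - 1), j++)
        nb[j] = r->buf[i];
    free(r->buf);
    r->buf  = nb;
    r->head = 0;
    r->tail = r->size;
    r->size = new_size;
}

/* Push never fails: a full deque grows instead of rejecting the task. */
static void ring_push(ring_t *r, void *task) {
    if (r->ntasks >= r->size)
        ring_grow(r);
    r->buf[r->tail] = task;
    r->tail = (r->tail + 1) & (r->size - 1);  /* wrap via power-of-two mask */
    r->ntasks++;
}

int main(void) {
    ring_t r = { malloc(4 * sizeof(void *)), 4, 0, 0, 0 };
    int v[5];
    for (int n = 0; n < 5; n++)   /* the 5th push triggers a grow to 8 slots */
        ring_push(&r, &v[n]);
    free(r.buf);
    return 0;
}

Compacting to head == 0 during the copy keeps the grow path simple: after
doubling, exactly the old size entries are live and the free half is
contiguous.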
Modified:
openmp/trunk/runtime/src/kmp.h
openmp/trunk/runtime/src/kmp_debugger.c
openmp/trunk/runtime/src/kmp_omp.h
openmp/trunk/runtime/src/kmp_tasking.c
Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=271319&r1=271318&r2=271319&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Tue May 31 14:07:00 2016
@@ -35,10 +35,6 @@
#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1
-#define TASK_DEQUE_BITS 8 // Used solely to define TASK_DEQUE_SIZE and TASK_DEQUE_MASK.
-#define TASK_DEQUE_SIZE ( 1 << TASK_DEQUE_BITS )
-#define TASK_DEQUE_MASK ( TASK_DEQUE_SIZE - 1 )
-
#ifdef BUILD_TIED_TASK_STACK
#define TASK_STACK_EMPTY 0 // entries when the stack is empty
@@ -2223,6 +2219,7 @@ typedef struct kmp_base_thread_data {
// Used only in __kmp_execute_tasks_template, maybe not avail until task is queued?
kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque
kmp_taskdata_t ** td_deque; // Deque of tasks encountered by td_thr, dynamically allocated
+ kmp_int32 td_deque_size; // Size of deque
kmp_uint32 td_deque_head; // Head of deque (will wrap)
kmp_uint32 td_deque_tail; // Tail of deque (will wrap)
kmp_int32 td_deque_ntasks; // Number of tasks in deque
@@ -2233,6 +2230,12 @@ typedef struct kmp_base_thread_data {
#endif // BUILD_TIED_TASK_STACK
} kmp_base_thread_data_t;
+#define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE
+#define INITIAL_TASK_DEQUE_SIZE ( 1 << TASK_DEQUE_BITS )
+
+#define TASK_DEQUE_SIZE(td) ((td).td_deque_size)
+#define TASK_DEQUE_MASK(td) ((td).td_deque_size - 1)
+
typedef union KMP_ALIGN_CACHE kmp_thread_data {
kmp_base_thread_data_t td;
double td_align; /* use worst case alignment */
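A note on the new macros above: masking with td_deque_size - 1 is only
equivalent to a modulo while td_deque_size stays a power of two. That
invariant holds here because the initial size is 1 << TASK_DEQUE_BITS and
every reallocation doubles it. A small self-check of the equivalence
(illustrative, not runtime code):

#include <assert.h>

int main(void) {
    unsigned size = 1u << 8;        /* INITIAL_TASK_DEQUE_SIZE */
    for (int grow = 0; grow < 3; grow++) {
        for (unsigned i = 0; i < 4 * size; i++)
            assert(((i + 1) & (size - 1)) == ((i + 1) % size));
        size *= 2;                  /* doubling preserves the invariant */
    }
    return 0;
}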
Modified: openmp/trunk/runtime/src/kmp_debugger.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_debugger.c?rev=271319&r1=271318&r2=271319&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_debugger.c (original)
+++ openmp/trunk/runtime/src/kmp_debugger.c Tue May 31 14:07:00 2016
@@ -93,7 +93,7 @@ __kmp_omp_debug_struct_info = {
sizeof( void * ),
OMP_LOCK_T_SIZE < sizeof(void *),
bs_last_barrier,
- TASK_DEQUE_SIZE,
+ INITIAL_TASK_DEQUE_SIZE,
// thread structure information
sizeof( kmp_base_info_t ),
@@ -222,6 +222,7 @@ __kmp_omp_debug_struct_info = {
// thread_data_t.
sizeof( kmp_thread_data_t ),
offset_and_size_of( kmp_base_thread_data_t, td_deque ),
+ offset_and_size_of( kmp_base_thread_data_t, td_deque_size ),
offset_and_size_of( kmp_base_thread_data_t, td_deque_head ),
offset_and_size_of( kmp_base_thread_data_t, td_deque_tail ),
offset_and_size_of( kmp_base_thread_data_t, td_deque_ntasks ),
Modified: openmp/trunk/runtime/src/kmp_omp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_omp.h?rev=271319&r1=271318&r2=271319&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_omp.h (original)
+++ openmp/trunk/runtime/src/kmp_omp.h Tue May 31 14:07:00 2016
@@ -218,6 +218,7 @@ typedef struct {
/* kmp_thread_data_t */
kmp_int32 hd_sizeof_struct;
offset_and_size_t hd_deque;
+ offset_and_size_t hd_deque_size;
offset_and_size_t hd_deque_head;
offset_and_size_t hd_deque_tail;
offset_and_size_t hd_deque_ntasks;
Modified: openmp/trunk/runtime/src/kmp_tasking.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_tasking.c?rev=271319&r1=271318&r2=271319&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_tasking.c (original)
+++ openmp/trunk/runtime/src/kmp_tasking.c Tue May 31 14:07:00 2016
@@ -305,7 +305,7 @@ __kmp_push_task(kmp_int32 gtid, kmp_task
}
// Check if deque is full
- if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
{
KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
gtid, taskdata ) );
@@ -317,7 +317,7 @@ __kmp_push_task(kmp_int32 gtid, kmp_task
#if OMP_41_ENABLED
// Need to recheck as we can get a proxy task from a thread outside of OpenMP
- if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
{
__kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
@@ -326,12 +326,12 @@ __kmp_push_task(kmp_int32 gtid, kmp_task
}
#else
// Must have room since no thread can add tasks but calling thread
- KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
+ KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td) );
#endif
thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata; // Push taskdata
// Wrap index.
- thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+ thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1); // Adjust task count
__kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
@@ -1641,7 +1641,7 @@ __kmp_remove_my_task( kmp_info_t * threa
return NULL;
}
- tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK; // Wrap index.
+ tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(thread_data->td); // Wrap index.
taskdata = thread_data -> td.td_deque[ tail ];
if (is_constrained) {
@@ -1735,10 +1735,10 @@ __kmp_steal_task( kmp_info_t *victim, km
if ( !is_constrained ) {
taskdata = victim_td -> td.td_deque[ victim_td -> td.td_deque_head ];
// Bump head pointer and Wrap.
- victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK;
+ victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK(victim_td->td);
} else {
// While we have postponed tasks let's steal from tail of the deque (smaller tasks)
- kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK; // Wrap index.
+ kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(victim_td->td); // Wrap index.
taskdata = victim_td -> td.td_deque[ tail ];
// we need to check if the candidate obeys task scheduling constraint:
// only child of current task can be scheduled
@@ -2267,14 +2267,42 @@ __kmp_alloc_task_deque( kmp_info_t *thre
KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
- __kmp_gtid_from_thread( thread ), TASK_DEQUE_SIZE, thread_data ) );
+ __kmp_gtid_from_thread( thread ), INITIAL_TASK_DEQUE_SIZE, thread_data ) );
// Allocate space for task deque, and zero the deque
// Cannot use __kmp_thread_calloc() because threads not around for
// kmp_reap_task_team( ).
thread_data -> td.td_deque = (kmp_taskdata_t **)
- __kmp_allocate( TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+ __kmp_allocate( INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+ thread_data -> td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}
+//------------------------------------------------------------------------------
+// __kmp_realloc_task_deque:
+// Re-allocates a task deque for a particular thread, copies the content from the old deque
+// and adjusts the necessary data structures relating to the deque.
+// This operation must be done with the deque_lock held
+
+static void __kmp_realloc_task_deque ( kmp_info_t *thread, kmp_thread_data_t *thread_data )
+{
+ kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
+ kmp_int32 new_size = 2 * size;
+
+ KE_TRACE( 10, ( "__kmp_realloc_task_deque: T#%d reallocating deque[from %d to %d] for thread_data %p\n",
+ __kmp_gtid_from_thread( thread ), size, new_size, thread_data ) );
+
+ kmp_taskdata_t ** new_deque = (kmp_taskdata_t **) __kmp_allocate( new_size * sizeof(kmp_taskdata_t *));
+
+ int i,j;
+ for ( i = thread_data->td.td_deque_head, j = 0; j < size; i = (i+1) & TASK_DEQUE_MASK(thread_data->td), j++ )
+ new_deque[j] = thread_data->td.td_deque[i];
+
+ __kmp_free(thread_data->td.td_deque);
+
+ thread_data -> td.td_deque_head = 0;
+ thread_data -> td.td_deque_tail = size;
+ thread_data -> td.td_deque = new_deque;
+ thread_data -> td.td_deque_size = new_size;
+}
//------------------------------------------------------------------------------
// __kmp_free_task_deque:
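The copy loop in __kmp_realloc_task_deque above linearizes the ring: entries
are copied in logical order starting at the old head, so the new deque starts
with head == 0 and tail == old size. A worked example with hypothetical
values (ints standing in for kmp_taskdata_t pointers):

#include <assert.h>

int main(void) {
    /* a full 4-slot deque whose head has wrapped to index 2:
       physical order {30, 40, 10, 20}, logical order 10, 20, 30, 40 */
    int old_deque[4] = {30, 40, 10, 20};
    int new_deque[8];
    int head = 2, size = 4;
    for (int i = head, j = 0; j < size; i = (i + 1) & (size - 1), j++)
        new_deque[j] = old_deque[i];
    assert(new_deque[0] == 10 && new_deque[1] == 20 &&
           new_deque[2] == 30 && new_deque[3] == 40);
    /* head becomes 0, tail becomes 4; slots 4..7 of the doubled deque are free */
    return 0;
}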
@@ -2769,7 +2797,7 @@ __kmp_tasking_barrier( kmp_team_t *team,
Because of this, __kmp_push_task needs to check if there's space after getting the lock
*/
-static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
+static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task, kmp_int32 pass )
{
kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_task_team_t * task_team = taskdata->td_task_team;
@@ -2789,23 +2817,37 @@ static bool __kmp_give_task ( kmp_info_t
return result;
}
- if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
{
KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
- return result;
- }
- __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+ // if this deque is bigger than the pass ratio give a chance to another thread
+ if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass ) return result;
- if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
- {
- KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
- goto release_and_exit;
+ __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+ __kmp_realloc_task_deque(thread,thread_data);
+
+ } else {
+
+ __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+ if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
+ {
+ KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+
+ // if this deque is bigger than the pass ratio give a chance to another thread
+ if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass )
+ goto release_and_exit;
+
+ __kmp_realloc_task_deque(thread,thread_data);
+ }
}
+ // lock is held here, and there is space in the deque
+
thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
// Wrap index.
- thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+ thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
result = true;
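The pass parameter caps growth: on the first pass a full deque is never grown
(the sweep just looks for a deque with free space), and each subsequent pass
doubles the allowance, so growth is spread across threads before any single
deque balloons. The check isolated in a hypothetical helper (may_grow is an
illustrative name; the diff writes the refusal case inline as
TASK_DEQUE_SIZE(td)/INITIAL_TASK_DEQUE_SIZE >= pass):

#include <assert.h>
#include <stdbool.h>

#define INITIAL_TASK_DEQUE_SIZE 256

static bool may_grow(int deque_size, int pass) {
    return deque_size / INITIAL_TASK_DEQUE_SIZE < pass;
}

int main(void) {
    assert(!may_grow(256, 1));  /* first pass: never grow, just look for space */
    assert( may_grow(256, 2));  /* second pass: initial-size deques may double */
    assert(!may_grow(512, 2));  /* but once-grown deques still refuse */
    assert( may_grow(512, 4));
    return 0;
}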
@@ -2919,14 +2961,21 @@ void __kmpc_proxy_task_completed_ooo ( k
kmp_team_t * team = taskdata->td_team;
kmp_int32 nthreads = team->t.t_nproc;
kmp_info_t *thread;
- kmp_int32 k = 0;
+
+ //This should be similar to start_k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
+ kmp_int32 start_k = 0;
+ kmp_int32 pass = 1;
+ kmp_int32 k = start_k;
do {
- //This should be similar to k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
//For now we're just linearly trying to find a thread
- k = (k+1) % nthreads;
thread = team->t.t_threads[k];
- } while ( !__kmp_give_task( thread, k, ptask ) );
+ k = (k+1) % nthreads;
+
+ // we did a full pass through all the threads
+ if ( k == start_k ) pass = pass << 1;
+
+ } while ( !__kmp_give_task( thread, k, ptask, pass ) );
__kmp_second_top_half_finish_proxy(taskdata);
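Putting the two pieces together, a self-contained simulation of the new
placement loop (simplified state, no locking; try_give is a hypothetical
stand-in for __kmp_give_task): with every deque full, the first sweep places
nothing, the cap then doubles, and the next attempt grows a deque and
succeeds.

#include <stdbool.h>
#include <stdio.h>

#define INITIAL_TASK_DEQUE_SIZE 256
#define NTHREADS 4

/* Models __kmp_give_task: a full deque only doubles while its size is below
   pass * INITIAL_TASK_DEQUE_SIZE; otherwise decline so the caller can try
   the next thread. */
static bool try_give(int *ntasks, int *size, int pass) {
    if (*ntasks >= *size) {
        if (*size / INITIAL_TASK_DEQUE_SIZE >= pass)
            return false;
        *size *= 2;              /* __kmp_realloc_task_deque */
    }
    (*ntasks)++;                 /* slot available: push succeeds */
    return true;
}

int main(void) {
    int ntasks[NTHREADS] = {256, 256, 256, 256};   /* every deque full */
    int size[NTHREADS]   = {256, 256, 256, 256};
    int start_k = 0, k = start_k, pass = 1, thread;

    do {                         /* mirrors __kmpc_proxy_task_completed_ooo */
        thread = k;
        k = (k + 1) % NTHREADS;
        if (k == start_k)        /* a full pass failed: relax the growth cap */
            pass <<= 1;
    } while (!try_give(&ntasks[thread], &size[thread], pass));

    printf("placed on thread %d, deque grown to %d\n", thread, size[thread]);
    return 0;
}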