[Openmp-commits] [openmp] r275052 - http://reviews.llvm.org/D22134: Implementation of OpenMP 4.5 nonmonotonic schedule modifier
Andrey Churbanov via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jul 11 03:44:57 PDT 2016
Author: achurbanov
Date: Mon Jul 11 05:44:57 2016
New Revision: 275052
URL: http://llvm.org/viewvc/llvm-project?rev=275052&view=rev
Log:
http://reviews.llvm.org/D22134: Implementation of OpenMP 4.5 nonmonotonic schedule modifier
Modified:
openmp/trunk/runtime/src/kmp.h
openmp/trunk/runtime/src/kmp_dispatch.cpp
openmp/trunk/runtime/src/kmp_settings.c
Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=275052&r1=275051&r2=275052&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Mon Jul 11 05:44:57 2016
@@ -1553,7 +1553,7 @@ struct shared_table {
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
-#ifdef KMP_STATIC_STEAL_ENABLED
+#if KMP_STATIC_STEAL_ENABLED
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 count;
kmp_int32 ub;
@@ -1728,7 +1728,10 @@ typedef struct kmp_disp {
#if OMP_45_ENABLED
kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
- kmp_int64 *th_doacross_info; // info on loop bounds
+ union { // we can use union here because doacross cannot be used in nonmonotonic loops
+ kmp_int64 *th_doacross_info; // info on loop bounds
+ kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable)
+ };
#else
void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
#endif
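
The union above is safe because a given worksharing loop is either a doacross (ordered(n)) loop or a candidate for chunk stealing, never both, so the two pointer-sized fields are never live at the same time and kmp_disp keeps its size. A minimal standalone sketch of the same idea (simplified names, not the actual kmp.h layout):

    // Illustration only: both members are 8 bytes on Intel(R) 64, so the
    // union adds no padding and costs nothing over a single pointer.
    struct example_disp {
        union {
            long long *doacross_info; // set only for doacross loops
            void      *steal_lock;    // set only for static_steal loops
        };
    };
    // Because the two loop kinds are mutually exclusive, writing one member
    // in one loop and the other member in a later loop never clobbers live data.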
Modified: openmp/trunk/runtime/src/kmp_dispatch.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_dispatch.cpp?rev=275052&r1=275051&r2=275052&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_dispatch.cpp (original)
+++ openmp/trunk/runtime/src/kmp_dispatch.cpp Mon Jul 11 05:44:57 2016
@@ -25,6 +25,12 @@
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
+// Need to raise Win version from XP to Vista here for support of InterlockedExchange64
+#if defined(_WIN32_WINNT) && defined(_M_IX86)
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x0502
+#endif
+
#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
@@ -71,7 +77,7 @@ struct i_maxmin< unsigned long long > {
};
//-------------------------------------------------------------------------
-#ifdef KMP_STATIC_STEAL_ENABLED
+#if KMP_STATIC_STEAL_ENABLED
// replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
template< typename T >
@@ -667,13 +673,13 @@ __kmp_dispatch_init(
( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
}
- /* Currently just ignore the monotonic and non-monotonic modifiers (the compiler isn't producing them
- * yet anyway).
- * When it is we'll want to look at them somewhere here and use that information to add to our
- * schedule choice. We shouldn't need to pass them on, they merely affect which schedule we can
- * legally choose for various dynamic cases. (In paritcular, whether or not a stealing scheme is legal).
- */
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+ #if ( KMP_STATIC_STEAL_ENABLED )
+ if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
+ // AC: we now have only one implementation of stealing, so use it
+ schedule = kmp_sch_static_steal;
+ else
+ #endif
+ schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
/* Pick up the nomerge/ordered bits from the scheduling type */
if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
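
In user code, the path added above corresponds to the OpenMP 4.5 schedule modifier syntax; once compilers emit the modifier, a loop like the following could be served by the stealing implementation (sketch: the chunk size, n and work() are placeholders):

    // OpenMP 4.5: nonmonotonic tells the runtime that chunks need not be
    // handed to a thread in increasing order, which is exactly what chunk
    // stealing does, so the runtime may pick kmp_sch_static_steal here.
    #pragma omp parallel for schedule(nonmonotonic: dynamic, 64)
    for (int i = 0; i < n; ++i)
        work(i);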
@@ -839,7 +845,7 @@ __kmp_dispatch_init(
}
switch ( schedule ) {
- #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+ #if ( KMP_STATIC_STEAL_ENABLED )
case kmp_sch_static_steal:
{
T nproc = th->th.th_team_nproc;
@@ -861,8 +867,19 @@ __kmp_dispatch_init(
pr->u.p.parm2 = lb;
//pr->pfields.parm3 = 0; // it's not used in static_steal
- pr->u.p.parm4 = id;
+ pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
pr->u.p.st = st;
+ if ( ___kmp_size_type > 4 ) {
+ // AC: TODO: check if 16-byte CAS available and use it to
+ // improve performance (probably wait for explicit request
+ // before spending time on this).
+ // For now use dynamically allocated per-thread lock,
+ // free memory in __kmp_dispatch_next when status==0.
+ KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
+ th->th.th_dispatch->th_steal_lock =
+ (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
+ __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
+ }
break;
} else {
KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
@@ -1222,7 +1239,6 @@ __kmp_dispatch_init(
}
#endif
#if ( KMP_STATIC_STEAL_ENABLED )
- if ( ___kmp_size_type < 8 ) {
// It cannot be guaranteed that after execution of a loop with some other schedule kind
// all the parm3 variables will contain the same value.
// Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
@@ -1234,8 +1250,7 @@ __kmp_dispatch_init(
volatile T * p = &pr->u.p.static_steal_counter;
*p = *p + 1;
}
- }
- #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
+ #endif // ( KMP_STATIC_STEAL_ENABLED )
#if OMPT_SUPPORT && OMPT_TRACE
if (ompt_enabled &&
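
Removing the size guard means every static_steal loop, regardless of index width, now bumps static_steal_counter. The counter acts as a per-loop generation tag: during stealing (below), a thief only touches a victim whose counter equals its own, that is, a victim that has already initialized the same loop instance. Roughly:

    // Simplified form of the readiness test used in __kmp_dispatch_next:
    // unequal counters mean the victim is still in (or before) dispatch_init
    // for this loop, so its count/ub are not yet valid to steal from.
    bool same_loop = (victim->u.p.static_steal_counter ==
                      pr->u.p.static_steal_counter);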
@@ -1423,7 +1438,7 @@ __kmp_dispatch_next(
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
typedef typename traits_t< T >::floating_t DBL;
-#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+#if ( KMP_STATIC_STEAL_ENABLED )
static const int ___kmp_size_type = sizeof( UT );
#endif
@@ -1582,21 +1597,97 @@ __kmp_dispatch_next(
status = 0;
} else {
switch (pr->schedule) {
- #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+ #if ( KMP_STATIC_STEAL_ENABLED )
case kmp_sch_static_steal:
{
T chunk = pr->u.p.parm1;
+ int nproc = th->th.th_team_nproc;
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
trip = pr->u.p.tc - 1;
if ( ___kmp_size_type > 4 ) {
- // Other threads do not look into the data of this thread,
- // so it's not necessary to make volatile casting.
- init = ( pr->u.p.count )++;
- status = ( init < (UT)pr->u.p.ub );
+ // use lock for 8-byte and CAS for 4-byte induction
+ // variable. TODO (optional): check and use 16-byte CAS
+ kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
+ KMP_DEBUG_ASSERT(lck != NULL);
+ if( pr->u.p.count < (UT)pr->u.p.ub ) {
+ __kmp_acquire_lock(lck, gtid);
+ // try to get own chunk of iterations
+ init = ( pr->u.p.count )++;
+ status = ( init < (UT)pr->u.p.ub );
+ __kmp_release_lock(lck, gtid);
+ } else {
+ status = 0; // no own chunks
+ }
+ if( !status ) { // try to steal
+ kmp_info_t **other_threads = team->t.t_threads;
+ int while_limit = nproc; // nproc attempts to find a victim
+ int while_index = 0;
+ // TODO: algorithm of searching for a victim
+ // should be cleaned up and measured
+ while ( ( !status ) && ( while_limit != ++while_index ) ) {
+ T remaining;
+ T victimIdx = pr->u.p.parm4;
+ T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+ dispatch_private_info_template< T > * victim =
+ reinterpret_cast< dispatch_private_info_template< T >* >
+ (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
+ while( ( victim == NULL || victim == pr ||
+ ( *(volatile T*)&victim->u.p.static_steal_counter !=
+ *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
+ oldVictimIdx != victimIdx )
+ {
+ victimIdx = (victimIdx + 1) % nproc;
+ victim = reinterpret_cast< dispatch_private_info_template< T >* >
+ (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
+ };
+ if( !victim ||
+ ( *(volatile T *)&victim->u.p.static_steal_counter !=
+ *(volatile T *)&pr->u.p.static_steal_counter ) )
+ {
+ continue; // try once more (nproc attempts in total)
+ // no victim is ready yet to participate in stealing
+ // because all victims are still in kmp_init_dispatch
+ }
+ if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
+ pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
+ continue; // not enough chunks to steal, goto next victim
+ }
+
+ lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
+ KMP_ASSERT(lck != NULL);
+ __kmp_acquire_lock(lck, gtid);
+ limit = victim->u.p.ub; // keep initial ub
+ if( victim->u.p.count >= limit ||
+ (remaining = limit - victim->u.p.count) < 2 )
+ {
+ __kmp_release_lock(lck, gtid);
+ pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
+ continue; // not enough chunks to steal
+ }
+ // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or by 1
+ if( remaining > 3 ) {
+ init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
+ } else {
+ init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
+ }
+ __kmp_release_lock(lck, gtid);
+
+ KMP_DEBUG_ASSERT(init + 1 <= limit);
+ pr->u.p.parm4 = victimIdx; // remember victim to steal from
+ status = 1;
+ while_index = 0;
+ // now update own count and ub with the stolen range, excluding the init chunk
+ __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
+ pr->u.p.count = init + 1;
+ pr->u.p.ub = limit;
+ __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
+ } // while (search for victim)
+ } // if (try to find victim and steal)
} else {
+ // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
typedef union {
struct {
UT count;
@@ -1605,7 +1696,6 @@ __kmp_dispatch_next(
kmp_int64 b;
} union_i4;
// All operations on 'count' or 'ub' must be combined atomically together.
- // stealing implemented only for 4-byte indexes
{
union_i4 vold, vnew;
vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
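
For 4-byte induction variables the lock-free path is kept: count and ub are packed into one 64-bit word so that both can be read, modified and written back with a single compare-and-swap. A self-contained C++ sketch of that packing (illustrative; the runtime uses KMP_COMPARE_AND_STORE_ACQ64 rather than std::atomic):

    #include <atomic>
    #include <cstdint>

    union packed_i4 {                       // mirrors union_i4 above
        struct { uint32_t count, ub; } p;   // two 4-byte halves ...
        int64_t b;                          // ... viewed as one 64-bit word
    };

    // Claim the next chunk from one's own range, or return false if empty.
    bool claim_chunk(std::atomic<int64_t> &word, uint32_t &my_chunk) {
        packed_i4 vold, vnew;
        vold.b = word.load();
        do {
            vnew = vold;
            if (vnew.p.count >= vnew.p.ub)
                return false;               // own range exhausted, go steal
            my_chunk = vnew.p.count++;      // take a chunk, advance count
        } while (!word.compare_exchange_weak(vold.b, vnew.b));
        return true;
    }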
@@ -1627,86 +1717,77 @@ __kmp_dispatch_next(
if( !status ) {
kmp_info_t **other_threads = team->t.t_threads;
- int while_limit = 10;
+ int while_limit = nproc; // nproc attempts to find a victim
int while_index = 0;
// TODO: algorithm of searching for a victim
// should be cleaned up and measured
while ( ( !status ) && ( while_limit != ++while_index ) ) {
union_i4 vold, vnew;
- kmp_int32 remaining; // kmp_int32 because KMP_I4 only
+ kmp_int32 remaining;
T victimIdx = pr->u.p.parm4;
- T oldVictimIdx = victimIdx;
- dispatch_private_info_template< T > * victim;
-
- do {
- if( !victimIdx ) {
- victimIdx = team->t.t_nproc - 1;
- } else {
- --victimIdx;
- }
+ T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+ dispatch_private_info_template< T > * victim =
+ reinterpret_cast< dispatch_private_info_template< T >* >
+ (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
+ while( (victim == NULL || victim == pr ||
+ (*(volatile T*)&victim->u.p.static_steal_counter !=
+ *(volatile T*)&pr->u.p.static_steal_counter)) &&
+ oldVictimIdx != victimIdx )
+ {
+ victimIdx = (victimIdx + 1) % nproc;
victim = reinterpret_cast< dispatch_private_info_template< T >* >
( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
- } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
- // TODO: think about a proper place of this test
- if ( ( !victim ) ||
- ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
- (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
- // TODO: delay would be nice
- continue;
- // the victim is not ready yet to participate in stealing
- // because the victim is still in kmp_init_dispatch
+ };
+ if( !victim ||
+ ( *(volatile T *)&victim->u.p.static_steal_counter !=
+ *(volatile T *)&pr->u.p.static_steal_counter ) )
+ {
+ continue; // try once more (nproc attempts in total)
+ // no victim is ready yet to participate in stealing
+ // because all victims are still in kmp_init_dispatch
}
- if ( oldVictimIdx == victimIdx ) {
- break;
- }
- pr->u.p.parm4 = victimIdx;
-
- while( 1 ) {
+ pr->u.p.parm4 = victimIdx; // new victim found
+ while( 1 ) { // CAS loop if victim has enough chunks to steal
vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
vnew = vold;
KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
- if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
- break;
+ if ( vnew.p.count >= (UT)vnew.p.ub ||
+ (remaining = vnew.p.ub - vnew.p.count) < 2 )
+ {
+ pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
+ break; // not enough chunks to steal, goto next victim
+ }
+ if( remaining > 3 ) {
+ vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
+ } else {
+ vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
}
- vnew.p.ub -= (remaining >> 2);
KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
- #pragma warning( push )
- // disable warning on pointless comparison of unsigned with 0
- #pragma warning( disable: 186 )
- KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
- #pragma warning( pop )
// TODO: Should this be acquire or release?
if ( KMP_COMPARE_AND_STORE_ACQ64(
( volatile kmp_int64 * )&victim->u.p.count,
*VOLATILE_CAST(kmp_int64 *)&vold.b,
*VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
+ // stealing succeeded
status = 1;
while_index = 0;
// now update own count and ub
+ init = vnew.p.ub;
+ vold.p.count = init + 1;
#if KMP_ARCH_X86
- // stealing executed on non-KMP_ARCH_X86 only
- // Atomic 64-bit write on ia32 is
- // unavailable, so we do this in steps.
- // This code is not tested.
- init = vold.p.count;
- pr->u.p.ub = 0;
- pr->u.p.count = init + 1;
- pr->u.p.ub = vnew.p.count;
+ KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
#else
- init = vnew.p.ub;
- vold.p.count = init + 1;
- // TODO: is it safe and enough?
- *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
- #endif // KMP_ARCH_X86
+ *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
+ #endif
break;
- } // if
- KMP_CPU_PAUSE();
- } // while (1)
- } // while
- } // if
- } // if
+ } // if (check CAS result)
+ KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
+ } // while (try to steal from particular victim)
+ } // while (search for victim)
+ } // if (try to find victim and steal)
+ } // if (4-byte induction variable)
if ( !status ) {
*p_lb = 0;
*p_ub = 0;
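
Both branches above use the same stealing policy: if the chosen victim still owns more than 3 unclaimed chunks, lower its ub by a quarter of them; if it owns 2 or 3, take exactly one; with fewer than 2 there is nothing worth stealing and the search moves to the next victim. The arithmetic in isolation (hypothetical helper, not part of the runtime):

    // How many chunks to take from a victim with `remaining` unclaimed chunks,
    // mirroring the remaining>>2 / steal-one logic in the code above.
    static inline long long chunks_to_steal(long long remaining) {
        if (remaining < 2) return 0;              // skip this victim
        if (remaining > 3) return remaining >> 2; // steal 1/4 of what is left
        return 1;                                 // 2 or 3 left: steal one
    }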
@@ -1748,7 +1829,7 @@ __kmp_dispatch_next(
} // if
break;
} // case
- #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
+ #endif // ( KMP_STATIC_STEAL_ENABLED )
case kmp_sch_static_balanced:
{
KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
@@ -2142,6 +2223,20 @@ __kmp_dispatch_next(
#endif
if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
+ #if ( KMP_STATIC_STEAL_ENABLED )
+ if( pr->schedule == kmp_sch_static_steal && ___kmp_size_type > 4 ) {
+ int i;
+ kmp_info_t **other_threads = team->t.t_threads;
+ // loop complete, safe to destroy locks used for stealing
+ for( i = 0; i < th->th.th_team_nproc; ++i ) {
+ kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
+ KMP_ASSERT(lck != NULL);
+ __kmp_destroy_lock( lck );
+ __kmp_free( lck );
+ other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
+ }
+ }
+ #endif
/* NOTE: release this buffer to be reused */
KMP_MB(); /* Flush all pending memory write invalidates. */
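
The lock lifetime is therefore one loop instance: each thread allocates its own lock in __kmp_dispatch_init (8-byte case only), and the thread that observes num_done reach nproc-1, i.e. the last one to finish, destroys and frees all of them. A generic sketch of that last-one-out-cleans-up pattern (std::atomic stands in for the runtime's own counter):

    #include <atomic>

    // Each of `nthreads` participants calls this exactly once when done;
    // only the final caller tears down the shared per-thread resources.
    void finish(std::atomic<int> &num_done, int nthreads, void (*destroy_all)()) {
        int done_before_me = num_done.fetch_add(1);
        if (done_before_me == nthreads - 1)   // I am the last thread
            destroy_all();                    // safe: no other users remain
    }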
Modified: openmp/trunk/runtime/src/kmp_settings.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_settings.c?rev=275052&r1=275051&r2=275052&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_settings.c (original)
+++ openmp/trunk/runtime/src/kmp_settings.c Mon Jul 11 05:44:57 2016
@@ -3543,9 +3543,8 @@ __kmp_stg_parse_omp_schedule( char const
__kmp_sched = kmp_sch_trapezoidal;
else if (!__kmp_strcasecmp_with_sentinel("static", value, ',')) /* STATIC */
__kmp_sched = kmp_sch_static;
-#ifdef KMP_STATIC_STEAL_ENABLED
- else if (KMP_ARCH_X86_64 &&
- !__kmp_strcasecmp_with_sentinel("static_steal", value, ','))
+#if KMP_STATIC_STEAL_ENABLED
+ else if (!__kmp_strcasecmp_with_sentinel("static_steal", value, ','))
__kmp_sched = kmp_sch_static_steal;
#endif
else {
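
Dropping the KMP_ARCH_X86_64 test means static_steal can now be requested through OMP_SCHEDULE on any build with KMP_STATIC_STEAL_ENABLED. A usage sketch (the value follows the usual kind[,chunk] form; the chunk size and the loop body are placeholders):

    // Compile with OpenMP support and run with
    //   OMP_SCHEDULE="static_steal,64"
    // so that schedule(runtime) resolves to the stealing schedule.
    #pragma omp parallel for schedule(runtime)
    for (int i = 0; i < n; ++i)
        work(i);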