[Openmp-commits] [openmp] r262532 - Add new OpenMP 4.5 doacross loop nest feature

Jonathan Peyton via Openmp-commits openmp-commits@lists.llvm.org
Wed Mar 2 14:42:07 PST 2016


Author: jlpeyton
Date: Wed Mar  2 16:42:06 2016
New Revision: 262532

URL: http://llvm.org/viewvc/llvm-project?rev=262532&view=rev
Log:
Add new OpenMP 4.5 doacross loop nest feature

From the standard: A doacross loop nest is a loop nest that has cross-iteration
dependence. An iteration is dependent on one or more lexicographically earlier
iterations. The ordered clause parameter on a loop directive identifies the
loop(s) associated with the doacross loop nest.

The init/fini routines allocate/free doacross buffer(s) for each loop for each
thread.  The wait routine waits for a flag designated by the dependence vector.
The post routine sets the flag designated by current iteration vector.  We use
a similar technique of shared buffer indices that covers up to 7 nowait loops
executed simultaneously by different threads (number 7 has no real meaning,
just heuristic value).  Also, the size of structures are kept intact via
reducing dummy arrays.

This needs to be put into the OpenMP runtime library in order for the compiler
team to develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17399

Modified:
    openmp/trunk/runtime/src/dllexports
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_csupport.c
    openmp/trunk/runtime/src/kmp_dispatch.cpp
    openmp/trunk/runtime/src/kmp_runtime.c

Modified: openmp/trunk/runtime/src/dllexports
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/dllexports?rev=262532&r1=262531&r2=262532&view=diff
==============================================================================
--- openmp/trunk/runtime/src/dllexports (original)
+++ openmp/trunk/runtime/src/dllexports Wed Mar  2 16:42:06 2016
@@ -389,6 +389,10 @@ kmpc_set_defaults
     %ifdef OMP_41
 	__kmpc_proxy_task_completed	    259
 	__kmpc_proxy_task_completed_ooo	    260
+        __kmpc_doacross_init                261
+        __kmpc_doacross_wait                262
+        __kmpc_doacross_post                263
+        __kmpc_doacross_fini                264
     %endif
 %endif
 

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=262532&r1=262531&r2=262532&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Wed Mar  2 16:42:06 2016
@@ -1665,7 +1665,7 @@ typedef struct dispatch_shared_info64 {
     volatile kmp_uint64      iteration;
     volatile kmp_uint64      num_done;
     volatile kmp_uint64      ordered_iteration;
-    kmp_int64   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+    kmp_int64   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
 } dispatch_shared_info64_t;
 
 typedef struct dispatch_shared_info {
@@ -1673,8 +1673,12 @@ typedef struct dispatch_shared_info {
         dispatch_shared_info32_t  s32;
         dispatch_shared_info64_t  s64;
     } u;
-/*    volatile kmp_int32      dispatch_abort;  depricated */
     volatile kmp_uint32     buffer_index;
+#if OMP_41_ENABLED
+    volatile kmp_int32      doacross_buf_idx;  // teamwise index
+    volatile kmp_uint32    *doacross_flags;    // shared array of iteration flags (0/1)
+    kmp_int32               doacross_num_done; // count finished threads
+#endif
 } dispatch_shared_info_t;
 
 typedef struct kmp_disp {
@@ -1688,7 +1692,13 @@ typedef struct kmp_disp {
 
     dispatch_private_info_t *th_disp_buffer;
     kmp_int32                th_disp_index;
+#if OMP_41_ENABLED
+    kmp_int32                th_doacross_buf_idx; // thread's doacross buffer index
+    volatile kmp_uint32     *th_doacross_flags;   // pointer to shared array of flags
+    kmp_int64               *th_doacross_info;    // info on loop bounds
+#else
     void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
+#endif
 #if KMP_USE_INTERNODE_ALIGNMENT
     char more_padding[INTERNODE_CACHE_LINE];
 #endif
@@ -3543,7 +3553,17 @@ KMP_EXPORT void __kmpc_push_num_threads(
 KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
 KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
 KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
-
+#endif
+#if OMP_41_ENABLED
+struct kmp_dim {  // loop bounds info casted to kmp_int64
+    kmp_int64 lo; // lower
+    kmp_int64 up; // upper
+    kmp_int64 st; // stride
+};
+KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32 num_dims, struct kmp_dim * dims);
+KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
 #endif
 
 KMP_EXPORT void*

Modified: openmp/trunk/runtime/src/kmp_csupport.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_csupport.c?rev=262532&r1=262531&r2=262532&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_csupport.c (original)
+++ openmp/trunk/runtime/src/kmp_csupport.c Wed Mar  2 16:42:06 2016
@@ -3049,5 +3049,294 @@ void __kmpc_place_threads(int nS, int sO
     __kmp_place_num_threads_per_core = nT;
 }
 
+#if OMP_41_ENABLED
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+@param num_dims  number of associated doacross loops.
+@param dims  info on loops bounds.
+
+Initialize doacross loop information.
+Expect compiler send us inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+*/
+void
+__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
+{
+    int j, idx;
+    kmp_int64 last, trace_count;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_uint32 *flags;
+    kmp_disp_t *pr_buf = th->th.th_dispatch;
+    dispatch_shared_info_t *sh_buf;
+
+    KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+                 gtid, num_dims, !team->t.t_serialized));
+    KMP_DEBUG_ASSERT(dims != NULL);
+    KMP_DEBUG_ASSERT(num_dims > 0);
+
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+    idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
+    sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+
+    // Save bounds info into allocated private buffer
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+    pr_buf->th_doacross_info =
+        (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
+    // Save also address of num_done in order to access it later without knowing the buffer index
+    pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+    pr_buf->th_doacross_info[2] = dims[0].lo;
+    pr_buf->th_doacross_info[3] = dims[0].up;
+    pr_buf->th_doacross_info[4] = dims[0].st;
+    last = 5;
+    for( j = 1; j < num_dims; ++j ) {
+        kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
+        if( dims[j].st == 1 ) { // most common case
+            // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
+            range_length = dims[j].up - dims[j].lo + 1;
+        } else {
+            if( dims[j].st > 0 ) {
+                KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
+                range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+            } else {            // negative increment
+                KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
+                range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+            }
+        }
+        pr_buf->th_doacross_info[last++] = range_length;
+        pr_buf->th_doacross_info[last++] = dims[j].lo;
+        pr_buf->th_doacross_info[last++] = dims[j].up;
+        pr_buf->th_doacross_info[last++] = dims[j].st;
+    }
+
+    // Compute total trip count.
+    // Start with range of dims[0] which we don't need to keep in the buffer.
+    if( dims[0].st == 1 ) { // most common case
+        trace_count = dims[0].up - dims[0].lo + 1;
+    } else if( dims[0].st > 0 ) {
+        KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
+        trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+    } else {   // negative increment
+        KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
+        trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+    }
+    for( j = 1; j < num_dims; ++j ) {
+        trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+    }
+    KMP_DEBUG_ASSERT(trace_count > 0);
+
+    // Check if shared buffer is not occupied by other loop (idx - KMP_MAX_DISP_BUF)
+    if( idx != sh_buf->doacross_buf_idx ) {
+        // Shared buffer is occupied, wait for it to be free
+        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
+    }
+    // Check if we are the first thread. After the CAS the first thread gets 0,
+    // others get 1 if initialization is in progress, allocated pointer otherwise.
+    flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
+        (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
+    if( flags == NULL ) {
+        // we are the first thread, allocate the array of flags
+        kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+        sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
+    } else if( (kmp_int64)flags == 1 ) {
+        // initialization is still in progress, need to wait
+        while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
+            KMP_YIELD(TRUE);
+        }
+    }
+    KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
+    pr_buf->th_doacross_flags = sh_buf->doacross_flags;      // save private copy in order to not
+                                                             // touch shared buffer on each iteration
+    KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+void
+__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
+{
+    kmp_int32 shft, num_dims, i;
+    kmp_uint32 flag;
+    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf;
+    kmp_int64 lo, up, st;
+
+    KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+
+    // calculate sequential iteration number and check out-of-bounds condition
+    pr_buf = th->th.th_dispatch;
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    num_dims = pr_buf->th_doacross_info[0];
+    lo = pr_buf->th_doacross_info[2];
+    up = pr_buf->th_doacross_info[3];
+    st = pr_buf->th_doacross_info[4];
+    if( st == 1 ) { // most common case
+        if( vec[0] < lo || vec[0] > up ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[0], lo, up));
+            return;
+        }
+        iter_number = vec[0] - lo;
+    } else if( st > 0 ) {
+        if( vec[0] < lo || vec[0] > up ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[0], lo, up));
+            return;
+        }
+        iter_number = (kmp_uint64)(vec[0] - lo) / st;
+    } else {        // negative increment
+        if( vec[0] > lo || vec[0] < up ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[0], lo, up));
+            return;
+        }
+        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+    }
+    for( i = 1; i < num_dims; ++i ) {
+        kmp_int64 iter, ln;
+        kmp_int32 j = i * 4;
+        ln = pr_buf->th_doacross_info[j + 1];
+        lo = pr_buf->th_doacross_info[j + 2];
+        up = pr_buf->th_doacross_info[j + 3];
+        st = pr_buf->th_doacross_info[j + 4];
+        if( st == 1 ) {
+            if( vec[i] < lo || vec[i] > up ) {
+                KA_TRACE(20,(
+                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                    gtid, vec[i], lo, up));
+                return;
+            }
+            iter = vec[i] - lo;
+        } else if( st > 0 ) {
+            if( vec[i] < lo || vec[i] > up ) {
+                KA_TRACE(20,(
+                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                    gtid, vec[i], lo, up));
+                return;
+            }
+            iter = (kmp_uint64)(vec[i] - lo) / st;
+        } else {   // st < 0
+            if( vec[i] > lo || vec[i] < up ) {
+                KA_TRACE(20,(
+                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                    gtid, vec[i], lo, up));
+                return;
+            }
+            iter = (kmp_uint64)(lo - vec[i]) / (-st);
+        }
+        iter_number = iter + ln * iter_number;
+    }
+    shft = iter_number % 32; // use 32-bit granularity
+    iter_number >>= 5;       // divided by 32
+    flag = 1 << shft;
+    while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
+        KMP_YIELD(TRUE);
+    }
+    KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+                 gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
+{
+    kmp_int32 shft, num_dims, i;
+    kmp_uint32 flag;
+    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf;
+    kmp_int64 lo, st;
+
+    KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+
+    // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
+    pr_buf = th->th.th_dispatch;
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    num_dims = pr_buf->th_doacross_info[0];
+    lo = pr_buf->th_doacross_info[2];
+    st = pr_buf->th_doacross_info[4];
+    if( st == 1 ) { // most common case
+        iter_number = vec[0] - lo;
+    } else if( st > 0 ) {
+        iter_number = (kmp_uint64)(vec[0] - lo) / st;
+    } else {        // negative increment
+        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+    }
+    for( i = 1; i < num_dims; ++i ) {
+        kmp_int64 iter, ln;
+        kmp_int32 j = i * 4;
+        ln = pr_buf->th_doacross_info[j + 1];
+        lo = pr_buf->th_doacross_info[j + 2];
+        st = pr_buf->th_doacross_info[j + 4];
+        if( st == 1 ) {
+            iter = vec[i] - lo;
+        } else if( st > 0 ) {
+            iter = (kmp_uint64)(vec[i] - lo) / st;
+        } else {   // st < 0
+            iter = (kmp_uint64)(lo - vec[i]) / (-st);
+        }
+        iter_number = iter + ln * iter_number;
+    }
+    shft = iter_number % 32; // use 32-bit granularity
+    iter_number >>= 5;       // divided by 32
+    flag = 1 << shft;
+    if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
+        KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
+    KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
+                 gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_fini(ident_t *loc, int gtid)
+{
+    kmp_int64 num_done;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+    KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+        return; // nothing to do
+    }
+    num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
+    if( num_done == th->th.th_team_nproc ) {
+        // we are the last thread, need to free shared resources
+        int idx = pr_buf->th_doacross_buf_idx - 1;
+        dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+        KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
+        KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
+        KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+        __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
+        sh_buf->doacross_flags = NULL;
+        sh_buf->doacross_num_done = 0;
+        sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use
+    }
+    // free private resources (need to keep buffer index forever)
+    __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
+    pr_buf->th_doacross_info = NULL;
+    KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
+#endif
+
 // end of file //
 

Modified: openmp/trunk/runtime/src/kmp_dispatch.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_dispatch.cpp?rev=262532&r1=262531&r2=262532&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_dispatch.cpp (original)
+++ openmp/trunk/runtime/src/kmp_dispatch.cpp Wed Mar  2 16:42:06 2016
@@ -163,7 +163,7 @@ struct dispatch_shared_infoXX_template {
     volatile UT     iteration;
     volatile UT     num_done;
     volatile UT     ordered_iteration;
-    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
+    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
 };
 
 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
@@ -175,6 +175,11 @@ struct dispatch_shared_info_template {
         dispatch_shared_info64_t               s64;
     } u;
     volatile kmp_uint32     buffer_index;
+#if OMP_41_ENABLED
+    volatile kmp_int32      doacross_buf_idx;  // teamwise index
+    kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
+    kmp_int32               doacross_num_done; // count finished threads
+#endif
 };
 
 /* ------------------------------------------------------------------------ */

Modified: openmp/trunk/runtime/src/kmp_runtime.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_runtime.c?rev=262532&r1=262531&r2=262532&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_runtime.c (original)
+++ openmp/trunk/runtime/src/kmp_runtime.c Wed Mar  2 16:42:06 2016
@@ -3046,8 +3046,12 @@ __kmp_allocate_team_arrays(kmp_team_t *t
     team->t.t_max_nproc = max_nth;
 
     /* setup dispatch buffers */
-    for(i = 0 ; i < num_disp_buff; ++i)
+    for(i = 0 ; i < num_disp_buff; ++i) {
         team->t.t_disp_buffer[i].buffer_index = i;
+#if OMP_41_ENABLED
+        team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+    }
 }
 
 static void
@@ -4121,7 +4125,9 @@ __kmp_initialize_info( kmp_info_t *this_
         KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
 
         dispatch->th_disp_index = 0;
-
+#if OMP_41_ENABLED
+        dispatch->th_doacross_buf_idx = 0;
+#endif
         if( ! dispatch->th_disp_buffer )  {
             dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
 
@@ -6813,7 +6819,9 @@ __kmp_run_before_invoked_task( int gtid,
     //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
 
     dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
-
+#if OMP_41_ENABLED
+    dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
+#endif
     if( __kmp_env_consistency_check )
         __kmp_push_parallel( gtid, team->t.t_ident );
 
@@ -7050,10 +7058,17 @@ __kmp_internal_fork( ident_t *id, int gt
     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
     if ( team->t.t_max_nproc > 1 ) {
         int i;
-        for (i = 0; i <  KMP_MAX_DISP_BUF; ++i)
+        for (i = 0; i <  KMP_MAX_DISP_BUF; ++i) {
             team->t.t_disp_buffer[ i ].buffer_index = i;
+#if OMP_41_ENABLED
+            team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+        }
     } else {
         team->t.t_disp_buffer[ 0 ].buffer_index = 0;
+#if OMP_41_ENABLED
+        team->t.t_disp_buffer[0].doacross_buf_idx = 0;
+#endif
     }
 
     KMP_MB();       /* Flush all pending memory write invalidates.  */




More information about the Openmp-commits mailing list