[Openmp-commits] [openmp] r333225 - [CUDA]Fix dynamic|guided scheduling.

George Rokos via Openmp-commits openmp-commits at lists.llvm.org
Thu May 24 14:12:42 PDT 2018


Author: grokos
Date: Thu May 24 14:12:41 2018
New Revision: 333225

URL: http://llvm.org/viewvc/llvm-project?rev=333225&view=rev
Log:
[CUDA]Fix dynamic|guided scheduling.

The existing implementation of dynamic scheduling breaks
the contract established by the original OpenMP runtime
and is therefore incorrect. This patch fixes it and
introduces a correct dynamic scheduling model.

Thanks to Alexey Bataev for submitting this patch.

Differential Revision: https://reviews.llvm.org/D47333
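
At a high level, the new model keeps the scheduling state per team
rather than per thread: thread 0 of the team publishes the chunk size
and loop bounds, and every thread then claims chunks with an atomic
add on the shared lower bound. Below is a minimal CUDA sketch of the
chunk-claiming step; the names (sharedLb, loopUb, chunkSz, nextChunk)
are illustrative, not the runtime's actual identifiers.

  // Illustrative team-shared scheduling state, not the runtime's real fields.
  __device__ int sharedLb;  // next lower bound to hand out
  __device__ int loopUb;    // inclusive upper bound (Clang emits i <= ub loops)
  __device__ int chunkSz;   // normalized chunk size (>= 1)

  // Each thread atomically claims the next [lb, ub] chunk of iterations.
  __device__ bool nextChunk(int &lb, int &ub) {
    lb = atomicAdd(&sharedLb, chunkSz);  // reserve chunkSz iterations at once
    ub = lb + chunkSz - 1;
    return lb <= loopUb;                 // false once the range is exhausted
  }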


Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=333225&r1=333224&r2=333225&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Thu May 24 14:12:41 2018
@@ -215,7 +215,8 @@ public:
            schedule <= kmp_sched_ordered_last;
   }
 
-  INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st,
+  INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                    ST chunk) {
     int tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
@@ -282,18 +283,15 @@ public:
              "unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
     }
 
-    // save sched state
-    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
-
     // init schedules
     if (schedule == kmp_sched_static_chunk) {
       ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
       // save ub
       omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -301,8 +299,8 @@ public:
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static chunk) : num threads = %d, ub =  %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static chunk) : num threads = %d, ub =  %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
@@ -310,11 +308,12 @@ public:
 
     } else if (schedule == kmp_sched_static_nochunk) {
       ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
       // save ub
       omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
       // compute static chunk
       ST stride;
-      T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
       int lastiter = 0;
       ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
       // save computed params
@@ -322,45 +321,50 @@ public:
       omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
       omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
       PRINT(LD_LOOP,
-            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ","
-            "next lower bound = %llu, stride = %llu\n",
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             omptarget_nvptx_threadPrivateContext->Stride(tid));
 
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      if (chunk < 1)
-        chunk = 1;
-      Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks
-      // but each thread (but one) must discover that it is last
-      eventNum += tnum;
-      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-      omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum;
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      if (GetThreadIdInBlock() == 0) {
+        if (chunk < 1)
+          chunk = 1;
+        int teamId = GetOmpTeamId();
+        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      }
+      if (isSPMDMode())
+        __syncthreads();
+      else
+        __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
-            "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", "
-            "events number = %llu\n",
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
             GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-            omptarget_nvptx_threadPrivateContext->Chunk(tid),
-            omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
     }
   }
 
   ////////////////////////////////////////////////////////////////////////////////
   // Support for dispatch next
 
-  INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg,
-                                     Counter priv, T &lb, T &ub,
-                                     Counter &chunkId, Counter &currentEvent,
-                                     T chunkSize, T loopUpperBound) {
-    // get next event atomically
-    Counter nextEvent = cg.Next();
-    // calculate chunk Id (priv was initialized upon entering the loop to
-    // 'start' == 'event')
-    chunkId = nextEvent - priv;
+  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+                                     Counter &loopLowerBound,
+                                     T loopUpperBound) {
     // calculate lower bound for all lanes in the warp
-    lb = chunkId * chunkSize; // this code assume normalization of LB
+    lb = atomicAdd(&loopLowerBound, (Counter)chunkSize);
     ub = lb + chunkSize - 1;  // Clang uses i <= ub
 
     // 3 result cases:
@@ -368,9 +372,8 @@ public:
     //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
     //  NOT_FINISHED
     //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
-    currentEvent = nextEvent;
     // a.
-    if (ub <= loopUpperBound) {
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
       PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
             P64(ub), P64(loopUpperBound));
       return NOT_FINISHED;
@@ -383,7 +386,8 @@ public:
       return LAST_CHUNK;
     }
     // c. if we are here, we are in case 'c'
-    lb = loopUpperBound + 1;
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
     PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
           P64(ub), P64(loopUpperBound));
     return FINISHED;
@@ -437,29 +441,18 @@ public:
     ASSERT0(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "bad sched");
-    omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
     T myLb, myUb;
-    Counter chunkId;
-    // xxx current event is now local
-    omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup();
+    int teamId = GetOmpTeamId();
     int finished = DynamicNextChunk(
-        cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb,
-        chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid),
-        omptarget_nvptx_threadPrivateContext->Chunk(tid),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
-
-    if (finished == FINISHED) {
-      cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                  omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
-      cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid),
-                 omptarget_nvptx_threadPrivateContext->CurrentEvent(tid));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
 
+    if (finished == FINISHED)
       return DISPATCH_FINISHED;
-    }
 
     // not finished (either not finished or last chunk)
-    *plast = (int32_t)(
-        myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+    *plast = (int32_t)(finished == LAST_CHUNK);
     *plower = myLb;
     *pupper = myUb;
     *pstride = 1;
@@ -491,7 +484,7 @@ EXTERN void __kmpc_dispatch_init_4(kmp_I
                                    int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
@@ -499,7 +492,7 @@ EXTERN void __kmpc_dispatch_init_4u(kmp_
                                     int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
@@ -507,7 +500,7 @@ EXTERN void __kmpc_dispatch_init_8(kmp_I
                                    int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
@@ -515,7 +508,7 @@ EXTERN void __kmpc_dispatch_init_8u(kmp_
                                     int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
 }
 
 // next
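
For context, the initialization added in dispatch_init above can be
summarized by the following hedged sketch, reusing the illustrative
globals from the earlier sketch. In SPMD mode the patch synchronizes
with __syncthreads(); generic mode goes through __kmpc_barrier(loc,
threadId) instead.

  // Barrier-guarded init: thread 0 publishes the team-wide state.
  __device__ void dynamicInit(int lb, int ub, int chunk) {
    __syncthreads();                     // no thread may still read old state
    if (threadIdx.x == 0) {
      chunkSz  = chunk < 1 ? 1 : chunk;  // normalize chunk, as the patch does
      sharedLb = lb;                     // team-wide next lower bound
      loopUb   = ub;                     // team-wide inclusive upper bound
    }
    __syncthreads();                     // publish state to the whole team
  }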



