[Openmp-commits] [openmp] r350524 - [OPENMP][NVPTX]Fix dynamic scheduling.

Alexey Bataev via Openmp-commits openmp-commits at lists.llvm.org
Mon Jan 7 06:25:25 PST 2019


Author: abataev
Date: Mon Jan  7 06:25:25 2019
New Revision: 350524

URL: http://llvm.org/viewvc/llvm-project?rev=350524&view=rev
Log:
[OPENMP][NVPTX]Fix dynamic scheduling.

Summary:
The previous implementation may cause a runtime crash when the number of
teams is > 1024. This patch fixes the problem and reduces the number of
atomic operations by a factor of 32.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, openmp-commits, caomhin

Differential Revision: https://reviews.llvm.org/D56332

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=350524&r1=350523&r2=350524&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Mon Jan  7 06:25:25 2019
@@ -352,18 +352,18 @@ public:
                 omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                 tid));
-
     } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      __kmpc_barrier(loc, threadId);
-      // save sched state
-      int teamId = GetOmpTeamId();
+      // save data
       omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      if (GetThreadIdInBlock() == 0) {
-        if (chunk < 1)
-          chunk = 1;
-        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
-        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      if (chunk < 1)
+        chunk = 1;
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
+        __threadfence_block();
       }
       __kmpc_barrier(loc, threadId);
       PRINT(LD_LOOP,
@@ -371,21 +371,45 @@ public:
             ", chunk %" PRIu64 "\n",
             (int)tnum,
             (unsigned long long)
-                omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
-            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->Chunk(tid));
     }
   }
 
   ////////////////////////////////////////////////////////////////////////////////
   // Support for dispatch next
 
+  INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
+    int lo, hi;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+    hi = __SHFL_SYNC(active, hi, leader);
+    lo = __SHFL_SYNC(active, lo, leader);
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+    return val;
+  }
+
+  INLINE static uint64_t NextIter() {
+    unsigned int active = __ACTIVEMASK();
+    int leader = __ffs(active) - 1;
+    int change = __popc(active);
+    unsigned lane_mask_lt;
+    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
+    unsigned int rank = __popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomicAdd(
+          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
+          change);
+    }
+    warp_res = Shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
   INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
-                                     int64_t &loopLowerBound,
-                                     T loopUpperBound) {
-    // calculate lower bound for all lanes in the warp
-    lb = atomicAdd((unsigned long long *)&loopLowerBound,
-                   (unsigned long long)chunkSize);
+                                     T loopLowerBound, T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
     ub = lb + chunkSize - 1;  // Clang uses i <= ub
 
     // 3 result cases:
@@ -461,11 +485,10 @@ public:
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "bad sched");
     T myLb, myUb;
-    int teamId = GetOmpTeamId();
     int finished = DynamicNextChunk(
-        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
-        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
 
     if (finished == FINISHED)
       return DISPATCH_FINISHED;

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=350524&r1=350523&r2=350524&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Mon Jan  7 06:25:25 2019
@@ -344,6 +344,7 @@ public:
   INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
 
   INLINE void InitThreadPrivateContext(int tid);
+  INLINE uint64_t &Cnt() { return cnt; }
 
 private:
   // team context for this team
@@ -366,6 +367,7 @@ private:
   // state for dispatch with dyn/guided OR static (never use both at a time)
   int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
   int64_t stride[MAX_THREADS_PER_TEAM];
+  uint64_t cnt;
 };
 
 /// Device envrionment data




More information about the Openmp-commits mailing list