[Openmp-commits] [openmp] r340944 - [OPENMP][NVPTX] Lightweight runtime support for SPMD mode.

Alexey Bataev via Openmp-commits openmp-commits at lists.llvm.org
Wed Aug 29 10:35:09 PDT 2018


Author: abataev
Date: Wed Aug 29 10:35:09 2018
New Revision: 340944

URL: http://llvm.org/viewvc/llvm-project?rev=340944&view=rev
Log:
[OPENMP][NVPTX] Lightweight runtime support for SPMD mode.

Summary:
Implemented simple and lightweight runtime support for SPMD mode-based
constructs. It adds support for L2 sequential parallelism wihtout full
runtime support. Also, patch fixes some use cases for
uninitialized|lightweight runtime.

Reviewers: grokos, kkwli0, Hahnfeld, gtbercea

Subscribers: guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D51222

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Wed Aug 29 10:35:09 2018
@@ -79,7 +79,7 @@ __device__ static size_t AlignVal(size_t
 EXTERN void
 __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
                                            size_t InitialDataSize) {
-
+  assert(isRuntimeInitialized() && "Expected initialized runtime.");
   DSPRINT0(DSFLAG_INIT,
            "Entering __kmpc_initialize_data_sharing_environment\n");
 
@@ -331,6 +331,7 @@ __kmpc_get_data_sharing_environment_fram
 ////////////////////////////////////////////////////////////////////////////////
 
 INLINE void data_sharing_init_stack_common() {
+  assert(isRuntimeInitialized() && "Expected initialized runtime.");
   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();
 
@@ -346,6 +347,7 @@ INLINE void data_sharing_init_stack_comm
 // initialization). This function is called only by the MASTER thread of each
 // team in non-SPMD mode.
 EXTERN void __kmpc_data_sharing_init_stack() {
+  assert(isRuntimeInitialized() && "Expected initialized runtime.");
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
@@ -357,6 +359,7 @@ EXTERN void __kmpc_data_sharing_init_sta
 // once at the beginning of a data sharing context (coincides with the kernel
 // initialization). This function is called in SPMD mode only.
 EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+  assert(isRuntimeInitialized() && "Expected initialized runtime.");
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h Wed Aug 29 10:35:09 2018
@@ -116,6 +116,8 @@ typedef enum kmp_sched_t {
   kmp_sched_runtime = 37,
   kmp_sched_auto = 38,
 
+  kmp_sched_static_balanced_chunk = 45,
+
   kmp_sched_static_ordered = 65,
   kmp_sched_static_nochunk_ordered = 66,
   kmp_sched_dynamic_ordered = 67,

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu Wed Aug 29 10:35:09 2018
@@ -31,6 +31,10 @@ EXTERN double omp_get_wtime(void) {
 }
 
 EXTERN void omp_set_num_threads(int num) {
+  // Ignore it for SPMD mode.
+  if (isSPMDMode())
+    return;
+  assert(isRuntimeInitialized() && "Expected initialized runtime.");
   PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
   if (num <= 0) {
     WARNING0(LW_INPUT, "expected positive num; ignore\n");
@@ -48,6 +52,12 @@ EXTERN int omp_get_num_threads(void) {
 }
 
 EXTERN int omp_get_max_threads(void) {
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    // We're already in parallel region.
+    return 1;  // default is 1 thread avail
+  }
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   int rc = 1; // default is 1 thread avail
   if (!currTaskDescr->InParallelRegion()) {
@@ -60,6 +70,11 @@ EXTERN int omp_get_max_threads(void) {
 }
 
 EXTERN int omp_get_thread_limit(void) {
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return 0;  // default is 0
+  }
   // per contention group.. meaning threads in current team
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   int rc = currTaskDescr->ThreadLimit();
@@ -82,9 +97,15 @@ EXTERN int omp_get_num_procs(void) {
 
 EXTERN int omp_in_parallel(void) {
   int rc = 0;
-  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
-  if (currTaskDescr->InParallelRegion()) {
-    rc = 1;
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    rc = 1;  // SPMD mode is always in parallel.
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    if (currTaskDescr->InParallelRegion()) {
+      rc = 1;
+    }
   }
   PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
   return rc;
@@ -102,6 +123,11 @@ EXTERN int omp_in_final(void) {
 
 EXTERN void omp_set_dynamic(int flag) {
   PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag);
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return;
+  }
 
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   if (flag) {
@@ -113,6 +139,11 @@ EXTERN void omp_set_dynamic(int flag) {
 
 EXTERN int omp_get_dynamic(void) {
   int rc = 0;
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return rc;
+  }
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   if (currTaskDescr->IsDynamic()) {
     rc = 1;
@@ -145,6 +176,11 @@ EXTERN int omp_get_max_active_levels(voi
 }
 
 EXTERN int omp_get_level(void) {
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+  }
   int level = 0;
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   ASSERT0(LT_FUSSY, currTaskDescr,
@@ -160,6 +196,11 @@ EXTERN int omp_get_level(void) {
 }
 
 EXTERN int omp_get_active_level(void) {
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return 1;
+  }
   int level = 0; // no active level parallelism
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   ASSERT0(LT_FUSSY, currTaskDescr,
@@ -177,6 +218,11 @@ EXTERN int omp_get_active_level(void) {
 }
 
 EXTERN int omp_get_ancestor_thread_num(int level) {
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return level == 1 ? GetThreadIdInBlock() : 0;
+  }
   int rc = 0; // default at level 0
   if (level >= 0) {
     int totLevel = omp_get_level();
@@ -220,6 +266,11 @@ EXTERN int omp_get_ancestor_thread_num(i
 }
 
 EXTERN int omp_get_team_size(int level) {
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return level == 1 ? GetNumberOfThreadsInBlock() : 1;
+  }
   int rc = 1; // default at level 0
   if (level >= 0) {
     int totLevel = omp_get_level();
@@ -247,9 +298,16 @@ EXTERN int omp_get_team_size(int level)
 }
 
 EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
-  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
-  *kind = currTaskDescr->GetRuntimeSched();
-  *modifier = currTaskDescr->RuntimeChunkSize();
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    *kind = omp_sched_static;
+    *modifier = 1;
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    *kind = currTaskDescr->GetRuntimeSched();
+    *modifier = currTaskDescr->RuntimeChunkSize();
+  }
   PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
         (int)*kind, *modifier);
 }
@@ -257,6 +315,11 @@ EXTERN void omp_get_schedule(omp_sched_t
 EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
   PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
         modifier);
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() &&
+           "expected SPMD mode only with uninitialized runtime.");
+    return;
+  }
   if (kind >= omp_sched_static && kind < omp_sched_auto) {
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
     currTaskDescr->SetRuntimeSched(kind);

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Wed Aug 29 10:35:09 2018
@@ -131,7 +131,7 @@ public:
     ST stride = *pstride;
     T entityId, numberOfEntities;
     // init
-    switch (schedtype) {
+    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
     case kmp_sched_static_chunk: {
       if (chunk > 0) {
         entityId =
@@ -143,6 +143,28 @@ public:
         break;
       }
     } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_balanced_chunk: {
+      if (chunk > 0) {
+        entityId =
+            GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+        numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                                 IsRuntimeUninitialized);
+
+        // round up to make sure the chunk is enough to cover all iterations
+        T tripCount = ub - lb + 1; // +1 because ub is inclusive
+        T span = (tripCount + numberOfEntities - 1) / numberOfEntities;
+        // perform chunk adjustment
+        chunk = (span + chunk - 1) & ~(chunk - 1);
+
+        assert(ub >= lb && "ub must be >= lb.");
+        T oldUb = ub;
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+                       numberOfEntities);
+        if (ub > oldUb)
+          ub = oldUb;
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
     case kmp_sched_static_nochunk: {
       entityId =
           GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
@@ -199,12 +221,13 @@ public:
     *plower = lb;
     *pupper = ub;
     *pstride = stride;
-    PRINT(LD_LOOP,
-          "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n",
-          GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
-                                IsRuntimeUninitialized),
-          GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
-          P64(*pstride));
+    PRINT(
+        LD_LOOP,
+        "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
+        "%d\n",
+        GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
+        GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
+        lastiter);
   }
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -218,6 +241,8 @@ public:
   INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
                                    kmp_sched_t schedule, T lb, T ub, ST st,
                                    ST chunk) {
+    assert(isRuntimeInitialized() &&
+           "Expected non-SPMD mode + initialized runtime.");
     int tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
     T tnum = currTaskDescr->ThreadsInTeam();
@@ -308,7 +333,38 @@ public:
             omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
             omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
             omptarget_nvptx_threadPrivateContext->Stride(tid));
+    } else if (schedule == kmp_sched_static_balanced_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // save ub
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      // round up to make sure the chunk is enough to cover all iterations
+      T span = (tripCount + tnum - 1) / tnum;
+      // perform chunk adjustment
+      chunk = (span + chunk - 1) & ~(chunk - 1);
 
+      T oldUb = ub;
+      ForStaticChunk(
+          lastiter, lb, ub, stride, chunk,
+          GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
+      assert(ub >= lb && "ub must be >= lb.");
+      if (ub > oldUb)
+        ub = oldUb;
+      // save computed params
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static chunk) : num threads = %d, ub =  %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->Stride(tid));
     } else if (schedule == kmp_sched_static_nochunk) {
       ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
       // save sched state
@@ -398,6 +454,8 @@ public:
   // in a warp cannot make independent progress.
   NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
                                     ST *pstride) {
+    assert(isRuntimeInitialized() &&
+           "Expected non-SPMD mode + initialized runtime.");
     // ID of a thread in its own warp
 
     // automatically selects thread or warp ID based on selected implementation
@@ -458,10 +516,11 @@ public:
     *pstride = 1;
 
     PRINT(LD_LOOP,
-          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n",
+          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+          "last %d\n",
           GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
-          GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
-          P64(*pstride));
+          GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride),
+          *plast);
     return DISPATCH_NOTFINISHED;
   }
 
@@ -736,6 +795,8 @@ INLINE void syncWorkersInGenericMode(uin
 EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid,
                                                   int32_t varNum, void *array) {
   PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
+  assert(isRuntimeInitialized() &&
+         "Expected non-SPMD mode + initialized runtime.");
 
   omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
   int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(),

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu Wed Aug 29 10:35:09 2018
@@ -27,10 +27,17 @@ __device__
     omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
         omptarget_nvptx_device_State[MAX_SM];
 
+__device__ omptarget_nvptx_Queue<omptarget_nvptx_SimpleThreadPrivateContext,
+                                 OMP_STATE_COUNT>
+    omptarget_nvptx_device_simpleState[MAX_SM];
+
 // Pointer to this team's OpenMP state object
 __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 
+__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
+    *omptarget_nvptx_simpleThreadPrivateContext;
+
 ////////////////////////////////////////////////////////////////////////////////
 // The team master sets the outlined parallel function in this variable to
 // communicate with the workers.  Since it is in shared memory, there is one

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Wed Aug 29 10:35:09 2018
@@ -24,6 +24,13 @@ extern __device__
 extern __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 
+extern __device__ omptarget_nvptx_Queue<
+    omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
+    omptarget_nvptx_device_simpleState[MAX_SM];
+
+extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
+    *omptarget_nvptx_simpleThreadPrivateContext;
+
 //
 // The team master sets the outlined function and its arguments in these
 // variables to communicate with the workers.  Since they are in shared memory,
@@ -53,12 +60,7 @@ EXTERN void __kmpc_kernel_init_params(vo
 EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
   PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
         OMPTARGET_NVPTX_VERSION);
-
-  if (!RequiresOMPRuntime) {
-    // If OMP runtime is not required don't initialize OMP state.
-    setExecutionParameters(Generic, RuntimeUninitialized);
-    return;
-  }
+  assert(RequiresOMPRuntime && "Generic always requires initialized runtime.");
   setExecutionParameters(Generic, RuntimeInitialized);
 
   int threadIdInBlock = GetThreadIdInBlock();
@@ -95,16 +97,16 @@ EXTERN void __kmpc_kernel_init(int Threa
 }
 
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
-  if (IsOMPRuntimeInitialized) {
-    // Enqueue omp state object for use by another team.
+  assert(IsOMPRuntimeInitialized &&
+         "Generic always requires initialized runtime.");
+  // Enqueue omp state object for use by another team.
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
+  int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
 #else
-    int slot = smid() % MAX_SM;
+  int slot = smid() % MAX_SM;
 #endif
-    omptarget_nvptx_device_State[slot].Enqueue(
-        omptarget_nvptx_threadPrivateContext);
-  }
+  omptarget_nvptx_device_State[slot].Enqueue(
+      omptarget_nvptx_threadPrivateContext);
   // Done with work.  Kill the workers.
   omptarget_nvptx_workFn = 0;
 }
@@ -116,6 +118,13 @@ EXTERN void __kmpc_spmd_kernel_init(int
   if (!RequiresOMPRuntime) {
     // If OMP runtime is not required don't initialize OMP state.
     setExecutionParameters(Spmd, RuntimeUninitialized);
+    if (GetThreadIdInBlock() == 0) {
+      int slot = smid() % MAX_SM;
+      omptarget_nvptx_simpleThreadPrivateContext =
+          omptarget_nvptx_device_simpleState[slot].Dequeue();
+    }
+    __syncthreads();
+    omptarget_nvptx_simpleThreadPrivateContext->Init();
     return;
   }
   setExecutionParameters(Spmd, RuntimeInitialized);
@@ -180,6 +189,15 @@ EXTERN void __kmpc_spmd_kernel_deinit()
   // there are no more parallel regions in SPMD mode.
   __syncthreads();
   int threadId = GetThreadIdInBlock();
+  if (isRuntimeUninitialized()) {
+    if (threadId == 0) {
+      // Enqueue omp state object for use by another team.
+      int slot = smid() % MAX_SM;
+      omptarget_nvptx_device_simpleState[slot].Enqueue(
+          omptarget_nvptx_simpleThreadPrivateContext);
+      return;
+    }
+  }
   if (threadId == 0) {
     // Enqueue omp state object for use by another team.
     int slot = smid() % MAX_SM;

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Wed Aug 29 10:35:09 2018
@@ -395,6 +395,38 @@ struct omptarget_device_environmentTy {
   int32_t debug_level;
 };
 
+class omptarget_nvptx_SimpleThreadPrivateContext {
+  uint16_t par_level[MAX_THREADS_PER_TEAM];
+public:
+  INLINE void Init() {
+    assert(isSPMDMode() && isRuntimeUninitialized() &&
+           "Expected SPMD + uninitialized runtime modes.");
+    par_level[GetThreadIdInBlock()] = 0;
+  }
+  INLINE void IncParLevel() {
+    assert(isSPMDMode() && isRuntimeUninitialized() &&
+           "Expected SPMD + uninitialized runtime modes.");
+    ++par_level[GetThreadIdInBlock()];
+  }
+  INLINE void DecParLevel() {
+    assert(isSPMDMode() && isRuntimeUninitialized() &&
+           "Expected SPMD + uninitialized runtime modes.");
+    assert(par_level[GetThreadIdInBlock()] > 0 &&
+           "Expected parallel level >0.");
+    --par_level[GetThreadIdInBlock()];
+  }
+  INLINE bool InL2OrHigherParallelRegion() const {
+    assert(isSPMDMode() && isRuntimeUninitialized() &&
+           "Expected SPMD + uninitialized runtime modes.");
+    return par_level[GetThreadIdInBlock()] > 0;
+  }
+  INLINE uint16_t GetParallelLevel() const {
+    assert(isSPMDMode() && isRuntimeUninitialized() &&
+           "Expected SPMD + uninitialized runtime modes.");
+    return par_level[GetThreadIdInBlock()] + 1;
+  }
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // global device envrionment
 ////////////////////////////////////////////////////////////////////////////////
@@ -409,6 +441,9 @@ extern __device__ omptarget_device_envir
 
 extern __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
+    *omptarget_nvptx_simpleThreadPrivateContext;
+
 extern __device__ __shared__ uint32_t execution_param;
 extern __device__ __shared__ void *ReductionScratchpadPtr;
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Wed Aug 29 10:35:09 2018
@@ -216,10 +216,9 @@ EXTERN void __kmpc_kernel_end_convergent
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                            int16_t IsOMPRuntimeInitialized) {
   PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
-  omptarget_nvptx_workFn = WorkFn;
+  assert(IsOMPRuntimeInitialized && "expected initialized runtime.");
 
-  if (!IsOMPRuntimeInitialized)
-    return;
+  omptarget_nvptx_workFn = WorkFn;
 
   // This routine is only called by the team master.  The team master is
   // the first thread of the last warp.  It always has the logical thread
@@ -320,12 +319,11 @@ EXTERN bool __kmpc_kernel_parallel(void
                                    int16_t IsOMPRuntimeInitialized) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
 
+  assert(IsOMPRuntimeInitialized && "expected initialized runtime.");
+
   // Work function and arguments for L1 parallel region.
   *WorkFn = omptarget_nvptx_workFn;
 
-  if (!IsOMPRuntimeInitialized)
-    return true;
-
   // If this is the termination signal from the master, quit early.
   if (!*WorkFn)
     return false;
@@ -363,6 +361,8 @@ EXTERN bool __kmpc_kernel_parallel(void
 EXTERN void __kmpc_kernel_end_parallel() {
   // pop stack
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
+  assert(isRuntimeInitialized() && "expected initialized runtime.");
+
   // Only the worker threads call this routine and the master warp
   // never arrives here.  Therefore, use the nvptx thread id.
   int threadId = GetThreadIdInBlock();
@@ -378,6 +378,12 @@ EXTERN void __kmpc_kernel_end_parallel()
 EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
 
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+    omptarget_nvptx_simpleThreadPrivateContext->IncParLevel();
+    return;
+  }
+
   // assume this is only called for nested parallel
   int threadId = GetLogicalThreadIdInBlock();
 
@@ -392,7 +398,7 @@ EXTERN void __kmpc_serialized_parallel(k
   // it
   omptarget_nvptx_TaskDescr *newTaskDescr =
       (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
-                                              (char *)"new seq parallel task");
+                                              "new seq parallel task");
   newTaskDescr->CopyParent(currTaskDescr);
 
   // tweak values for serialized parallel case:
@@ -410,6 +416,12 @@ EXTERN void __kmpc_end_serialized_parall
                                            uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
 
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+    omptarget_nvptx_simpleThreadPrivateContext->DecParLevel();
+    return;
+  }
+
   // pop stack
   int threadId = GetLogicalThreadIdInBlock();
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
@@ -425,6 +437,11 @@ EXTERN void __kmpc_end_serialized_parall
 EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
 
+  if (isRuntimeUninitialized()) {
+    assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+    return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
+  }
+
   int threadId = GetLogicalThreadIdInBlock();
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
@@ -451,6 +468,7 @@ EXTERN int32_t __kmpc_global_thread_num(
 EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
                                     int32_t num_threads) {
   PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+  assert(isRuntimeInitialized() && "Runtime must be initialized.");
   tid = GetLogicalThreadIdInBlock();
   omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
       num_threads;
@@ -459,6 +477,7 @@ EXTERN void __kmpc_push_num_threads(kmp_
 EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
                                    int32_t simd_limit) {
   PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
+  assert(isRuntimeInitialized() && "Runtime must be initialized.");
   tid = GetLogicalThreadIdInBlock();
   omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
 }

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h Wed Aug 29 10:35:09 2018
@@ -101,9 +101,13 @@ INLINE int GetOmpThreadId(int threadId,
   int rc;
 
   if (isRuntimeUninitialized) {
-    rc = GetThreadIdInBlock();
-    if (!isSPMDExecutionMode && rc >= GetMasterThreadID())
+    assert(isSPMDExecutionMode && "Uninitialized runtime with non-SPMD mode.");
+    // For level 2 parallelism all parallel regions are executed sequentially.
+    if (omptarget_nvptx_simpleThreadPrivateContext
+            ->InL2OrHigherParallelRegion())
       rc = 0;
+    else
+      rc = GetThreadIdInBlock();
   } else {
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
@@ -118,8 +122,13 @@ INLINE int GetNumberOfOmpThreads(int thr
   int rc;
 
   if (isRuntimeUninitialized) {
-    rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock()
-                             : GetNumberOfThreadsInBlock() - WARPSIZE;
+    assert(isSPMDExecutionMode && "Uninitialized runtime with non-SPMD mode.");
+    // For level 2 parallelism all parallel regions are executed sequentially.
+    if (omptarget_nvptx_simpleThreadPrivateContext
+            ->InL2OrHigherParallelRegion())
+      rc = 1;
+    else
+      rc = GetNumberOfThreadsInBlock();
   } else {
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu Wed Aug 29 10:35:09 2018
@@ -42,10 +42,8 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
   if (isRuntimeUninitialized()) {
-    if (isSPMDMode())
-      __kmpc_barrier_simple_spmd(loc_ref, tid);
-    else
-      __kmpc_barrier_simple_generic(loc_ref, tid);
+    assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime.");
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
   } else {
     tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr =

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu?rev=340944&r1=340943&r2=340944&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu Wed Aug 29 10:35:09 2018
@@ -81,6 +81,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps
                                          void *noAliasDepList) {
   PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
         P64(newKmpTaskDescr));
+  assert(isRuntimeInitialized() && "Runtime must be initialized.");
   // 1. get explict task descr from kmp task descr
   omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
       (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
@@ -117,6 +118,7 @@ EXTERN void __kmpc_omp_task_begin_if0(km
                                       kmp_TaskDescr *newKmpTaskDescr) {
   PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
         P64(newKmpTaskDescr));
+  assert(isRuntimeInitialized() && "Runtime must be initialized.");
   // 1. get explict task descr from kmp task descr
   omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
       (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
@@ -141,6 +143,7 @@ EXTERN void __kmpc_omp_task_complete_if0
                                          kmp_TaskDescr *newKmpTaskDescr) {
   PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
         P64(newKmpTaskDescr));
+  assert(isRuntimeInitialized() && "Runtime must be initialized.");
   // 1. get explict task descr from kmp task descr
   omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
       (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(




More information about the Openmp-commits mailing list