[Openmp-commits] [openmp] r359822 - [OPENMP][NVPTX]Improve code by using parallel level counter.

Alexey Bataev via Openmp-commits openmp-commits at lists.llvm.org
Thu May 2 13:05:02 PDT 2019


Author: abataev
Date: Thu May  2 13:05:01 2019
New Revision: 359822

URL: http://llvm.org/viewvc/llvm-project?rev=359822&view=rev
Log:
[OPENMP][NVPTX]Improve code by using parallel level counter.

Summary:
Previously for the different purposes we need to get the active/common
parallel level and with full runtime we iterated over all the records to
calculate this level. Instead, we can used the warp-based parallel level
counters used in no-runtime mode.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, jdoerfert, caomhin, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D61395

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu Thu May  2 13:05:01 2019
@@ -47,8 +47,7 @@ EXTERN void omp_set_num_threads(int num)
 EXTERN int omp_get_num_threads(void) {
   bool isSPMDExecutionMode = isSPMDMode();
   int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int rc =
-      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
+  int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
   PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
   return rc;
 }
@@ -83,7 +82,7 @@ EXTERN int omp_get_thread_limit(void) {
 EXTERN int omp_get_thread_num() {
   bool isSPMDExecutionMode = isSPMDMode();
   int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized());
+  int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
   PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
   return rc;
 }
@@ -95,18 +94,7 @@ EXTERN int omp_get_num_procs(void) {
 }
 
 EXTERN int omp_in_parallel(void) {
-  int rc = 0;
-  if (isRuntimeUninitialized()) {
-    ASSERT0(LT_FUSSY, isSPMDMode(),
-            "Expected SPMD mode only with uninitialized runtime.");
-    rc = 1;  // SPMD mode is always in parallel.
-  } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr =
-        getMyTopTaskDescriptor(isSPMDMode());
-    if (currTaskDescr->InParallelRegion()) {
-      rc = 1;
-    }
-  }
+  int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
   PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
   return rc;
 }
@@ -155,46 +143,13 @@ EXTERN int omp_get_max_active_levels(voi
 }
 
 EXTERN int omp_get_level(void) {
-  if (isRuntimeUninitialized()) {
-    ASSERT0(LT_FUSSY, isSPMDMode(),
-            "Expected SPMD mode only with uninitialized runtime.");
-    // parallelLevel starts from 0, need to add 1 for correct level.
-    return parallelLevel[GetWarpId()] + 1;
-  }
-  int level = 0;
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      getMyTopTaskDescriptor(isSPMDMode());
-  ASSERT0(LT_FUSSY, currTaskDescr,
-          "do not expect fct to be called in a non-active thread");
-  do {
-    if (currTaskDescr->IsParallelConstruct()) {
-      level++;
-    }
-    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
-  } while (currTaskDescr);
+  int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
   PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
   return level;
 }
 
 EXTERN int omp_get_active_level(void) {
-  if (isRuntimeUninitialized()) {
-    ASSERT0(LT_FUSSY, isSPMDMode(),
-            "Expected SPMD mode only with uninitialized runtime.");
-    return 1;
-  }
-  int level = 0; // no active level parallelism
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      getMyTopTaskDescriptor(isSPMDMode());
-  ASSERT0(LT_FUSSY, currTaskDescr,
-          "do not expect fct to be called in a non-active thread");
-  do {
-    if (currTaskDescr->ThreadsInTeam() > 1) {
-      // has a parallel with more than one thread in team
-      level = 1;
-      break;
-    }
-    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
-  } while (currTaskDescr);
+  int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
   PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
   return level;
 }

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Thu May  2 13:05:01 2019
@@ -95,8 +95,7 @@ public:
   INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
                                      int32_t *plastiter, T *plower, T *pupper,
                                      ST *pstride, ST chunk,
-                                     bool IsSPMDExecutionMode,
-                                     bool IsRuntimeUninitialized) {
+                                     bool IsSPMDExecutionMode) {
     // When IsRuntimeUninitialized is true, we assume that the caller is
     // in an L0 parallel region and that all worker threads participate.
 
@@ -104,8 +103,8 @@ public:
 
     // Assume we are in teams region or that we use a single block
     // per target region
-    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
-        tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+    ST numberOfActiveOMPThreads =
+        GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);
 
     // All warps that are in excess of the maximum requested, do
     // not execute the loop
@@ -456,9 +455,7 @@ public:
 
     // automatically selects thread or warp ID based on selected implementation
     int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-    ASSERT0(LT_FUSSY,
-            gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
-                                         checkRuntimeUninitialized(loc)),
+    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)),
             "current thread is not needed here; error");
     // retrieve schedule
     kmp_sched_t schedule =
@@ -509,13 +506,12 @@ public:
     *pupper = myUb;
     *pstride = 1;
 
-    PRINT(
-        LD_LOOP,
-        "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
-        "last %d\n",
-        (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
-        (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper,
-        (long long)*pstride, (int)*plast);
+    PRINT(LD_LOOP,
+          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+          "last %d\n",
+          (int)GetNumberOfOmpThreads(tid, isSPMDMode()),
+          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
+          (long long)*pupper, (long long)*pstride, (int)*plast);
     return DISPATCH_NOTFINISHED;
   }
 
@@ -629,7 +625,7 @@ EXTERN void __kmpc_for_static_init_4(kmp
   PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+      checkSPMDMode(loc));
 }
 
 EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
@@ -640,7 +636,7 @@ EXTERN void __kmpc_for_static_init_4u(km
   PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+      checkSPMDMode(loc));
 }
 
 EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
@@ -651,7 +647,7 @@ EXTERN void __kmpc_for_static_init_8(kmp
   PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+      checkSPMDMode(loc));
 }
 
 EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
@@ -662,7 +658,7 @@ EXTERN void __kmpc_for_static_init_8u(km
   PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+      checkSPMDMode(loc));
 }
 
 EXTERN
@@ -674,7 +670,7 @@ void __kmpc_for_static_init_4_simple_spm
   PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/true);
 }
 
 EXTERN
@@ -686,7 +682,7 @@ void __kmpc_for_static_init_4u_simple_sp
   PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/true);
 }
 
 EXTERN
@@ -698,7 +694,7 @@ void __kmpc_for_static_init_8_simple_spm
   PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/true);
 }
 
 EXTERN
@@ -710,7 +706,7 @@ void __kmpc_for_static_init_8u_simple_sp
   PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/true);
 }
 
 EXTERN
@@ -721,7 +717,7 @@ void __kmpc_for_static_init_4_simple_gen
   PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/false);
 }
 
 EXTERN
@@ -732,7 +728,7 @@ void __kmpc_for_static_init_4u_simple_ge
   PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/false);
 }
 
 EXTERN
@@ -743,7 +739,7 @@ void __kmpc_for_static_init_8_simple_gen
   PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/false);
 }
 
 EXTERN
@@ -754,7 +750,7 @@ void __kmpc_for_static_init_8u_simple_ge
   PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
       global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
-      /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true);
+      /*IsSPMDExecutionMode=*/false);
 }
 
 EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
@@ -787,8 +783,7 @@ EXTERN void __kmpc_reduce_conditional_la
 
   omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
   int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-  uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
-                                              checkRuntimeUninitialized(loc));
+  uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
   uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
   for (unsigned i = 0; i < varNum; i++) {
     // Reset buffer.

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Thu May  2 13:05:01 2019
@@ -43,6 +43,8 @@ EXTERN void __kmpc_kernel_init(int Threa
   ASSERT0(LT_FUSSY, RequiresOMPRuntime,
           "Generic always requires initialized runtime.");
   setExecutionParameters(Generic, RuntimeInitialized);
+  for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
+    parallelLevel[I] = 0;
 
   int threadIdInBlock = GetThreadIdInBlock();
   ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
@@ -91,32 +93,32 @@ EXTERN void __kmpc_spmd_kernel_init(int
                                     int16_t RequiresDataSharing) {
   PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
 
+  setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
+                                                  : RuntimeUninitialized);
+  int threadId = GetThreadIdInBlock();
+  if (threadId == 0) {
+    usedSlotIdx = smid() % MAX_SM;
+    parallelLevel[0] =
+        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+  } else if (GetLaneId() == 0) {
+    parallelLevel[GetWarpId()] =
+        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+  }
   if (!RequiresOMPRuntime) {
-    // If OMP runtime is not required don't initialize OMP state.
-    setExecutionParameters(Spmd, RuntimeUninitialized);
-    if (GetThreadIdInBlock() == 0) {
-      usedSlotIdx = smid() % MAX_SM;
-      parallelLevel[0] = 0;
-    } else if (GetLaneId() == 0) {
-      parallelLevel[GetWarpId()] = 0;
-    }
+    // Runtime is not required - exit.
     __SYNCTHREADS();
     return;
   }
-  setExecutionParameters(Spmd, RuntimeInitialized);
 
   //
   // Team Context Initialization.
   //
   // In SPMD mode there is no master thread so use any cuda thread for team
   // context initialization.
-  int threadId = GetThreadIdInBlock();
   if (threadId == 0) {
     // Get a state object from the queue.
-    int slot = smid() % MAX_SM;
-    usedSlotIdx = slot;
     omptarget_nvptx_threadPrivateContext =
-        omptarget_nvptx_device_State[slot].Dequeue();
+        omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
 
     omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
     omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
@@ -148,7 +150,7 @@ EXTERN void __kmpc_spmd_kernel_init(int
         "%d threads\n",
         (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());
 
-  if (RequiresDataSharing && threadId % WARPSIZE == 0) {
+  if (RequiresDataSharing && GetLaneId() == 0) {
     // Warp master innitializes data sharing environment.
     unsigned WID = threadId / WARPSIZE;
     __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h Thu May  2 13:05:01 2019
@@ -44,6 +44,8 @@
 #define MAX_SM 16
 #endif
 
+#define OMP_ACTIVE_PARALLEL_LEVEL 128
+
 ////////////////////////////////////////////////////////////////////////////////
 // algo options
 ////////////////////////////////////////////////////////////////////////////////

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Thu May  2 13:05:01 2019
@@ -311,6 +311,7 @@ EXTERN bool __kmpc_kernel_parallel(void
           (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
 
     isActive = true;
+    IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1);
   }
 
   return isActive;
@@ -327,6 +328,8 @@ EXTERN void __kmpc_kernel_end_parallel()
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, currTaskDescr->GetPrevTaskDescr());
+
+  DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -336,16 +339,11 @@ EXTERN void __kmpc_kernel_end_parallel()
 EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
 
+  IncParallelLevel(/*ActiveParallel=*/false);
+
   if (checkRuntimeUninitialized(loc)) {
     ASSERT0(LT_FUSSY, checkSPMDMode(loc),
             "Expected SPMD mode with uninitialized runtime.");
-    unsigned tnum = __ACTIVEMASK();
-    int leader = __ffs(tnum) - 1;
-    __SHFL_SYNC(tnum, leader, leader);
-    if (GetLaneId() == leader)
-      ++parallelLevel[GetWarpId()];
-    __SHFL_SYNC(tnum, leader, leader);
-
     return;
   }
 
@@ -381,15 +379,11 @@ EXTERN void __kmpc_end_serialized_parall
                                            uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
 
+  DecParallelLevel(/*ActiveParallel=*/false);
+
   if (checkRuntimeUninitialized(loc)) {
     ASSERT0(LT_FUSSY, checkSPMDMode(loc),
             "Expected SPMD mode with uninitialized runtime.");
-    unsigned tnum = __ACTIVEMASK();
-    int leader = __ffs(tnum) - 1;
-    __SHFL_SYNC(tnum, leader, leader);
-    if (GetLaneId() == leader)
-      --parallelLevel[GetWarpId()];
-    __SHFL_SYNC(tnum, leader, leader);
     return;
   }
 
@@ -408,21 +402,7 @@ EXTERN void __kmpc_end_serialized_parall
 EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
   PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
 
-  if (checkRuntimeUninitialized(loc)) {
-    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
-            "Expected SPMD mode with uninitialized runtime.");
-    return parallelLevel[GetWarpId()] + 1;
-  }
-
-  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-  if (currTaskDescr->InL2OrHigherParallelRegion())
-    return 2;
-  else if (currTaskDescr->InParallelRegion())
-    return 1;
-  else
-    return 0;
+  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
 }
 
 // This kmpc call returns the thread id across all teams. It's value is
@@ -431,8 +411,7 @@ EXTERN uint16_t __kmpc_parallel_level(km
 // of this call.
 EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
   int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-  return GetOmpThreadId(tid, checkSPMDMode(loc),
-                        checkRuntimeUninitialized(loc));
+  return GetOmpThreadId(tid, checkSPMDMode(loc));
 }
 
 ////////////////////////////////////////////////////////////////////////////////

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu Thu May  2 13:05:01 2019
@@ -21,8 +21,7 @@ EXTERN
 int32_t __gpu_block_reduce() {
   bool isSPMDExecutionMode = isSPMDMode();
   int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int nt =
-      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
+  int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();
@@ -40,9 +39,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   int numthread;
   if (currTaskDescr->IsParallelConstruct()) {
-    numthread =
-        GetNumberOfOmpThreads(threadId, checkSPMDMode(loc),
-                              checkRuntimeUninitialized(loc));
+    numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc));
   } else {
     numthread = GetNumberOfOmpTeams();
   }
@@ -150,8 +147,8 @@ static int32_t nvptx_parallel_reduce_now
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
   uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+  uint32_t NumThreads =
+      GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode);
   if (NumThreads == 1)
     return 1;
   /*
@@ -236,10 +233,9 @@ static int32_t nvptx_parallel_reduce_now
 EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
-  return nvptx_parallel_reduce_nowait(
-      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
-      /*isSPMDExecutionMode=*/isSPMDMode(),
-      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct,
+                                      isSPMDMode(), isRuntimeUninitialized());
 }
 
 EXTERN
@@ -256,36 +252,35 @@ EXTERN
 int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
-  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
-                                      reduce_data, shflFct, cpyFct,
-                                      /*isSPMDExecutionMode=*/true,
-                                      /*isRuntimeUninitialized=*/true);
+  return nvptx_parallel_reduce_nowait(
+      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
+      /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
 }
 
 EXTERN
 int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
-  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
-                                      reduce_data, shflFct, cpyFct,
-                                      /*isSPMDExecutionMode=*/false,
-                                      /*isRuntimeUninitialized=*/true);
+  return nvptx_parallel_reduce_nowait(
+      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
+      /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
 }
 
 INLINE
-static int32_t nvptx_teams_reduce_nowait(
-    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
-    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
-    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
-    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
+static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                         size_t reduce_size, void *reduce_data,
+                                         kmp_ShuffleReductFctPtr shflFct,
+                                         kmp_InterWarpCopyFctPtr cpyFct,
+                                         kmp_CopyToScratchpadFctPtr scratchFct,
+                                         kmp_LoadReduceFctPtr ldFct,
+                                         bool isSPMDExecutionMode) {
   uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
   // In non-generic mode all workers participate in the teams reduction.
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.
   uint32_t NumThreads =
       isSPMDExecutionMode
-          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
-                                  isRuntimeUninitialized)
+          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
           : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
@@ -406,10 +401,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait
                                          kmp_InterWarpCopyFctPtr cpyFct,
                                          kmp_CopyToScratchpadFctPtr scratchFct,
                                          kmp_LoadReduceFctPtr ldFct) {
-  return nvptx_teams_reduce_nowait(
-      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
-      scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(),
-      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct, isSPMDMode());
 }
 
 EXTERN
@@ -419,9 +413,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait
     kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
   return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                    reduce_data, shflFct, cpyFct, scratchFct,
-                                   ldFct,
-                                   /*isSPMDExecutionMode=*/true,
-                                   /*isRuntimeUninitialized=*/true);
+                                   ldFct, /*isSPMDExecutionMode=*/true);
 }
 
 EXTERN
@@ -431,9 +423,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait
     kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
   return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                    reduce_data, shflFct, cpyFct, scratchFct,
-                                   ldFct,
-                                   /*isSPMDExecutionMode=*/false,
-                                   /*isRuntimeUninitialized=*/true);
+                                   ldFct, /*isSPMDExecutionMode=*/false);
 }
 
 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
@@ -484,8 +474,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce
   // reduction because the workers are waiting for parallel work.
   uint32_t NumThreads =
       checkSPMDMode(loc)
-          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
-                                  checkRuntimeUninitialized(loc))
+          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
           : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h Thu May  2 13:05:01 2019
@@ -49,15 +49,14 @@ INLINE int GetMasterThreadID();
 INLINE int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
-                          bool isRuntimeUninitialized); // omp_thread_num
+INLINE int GetOmpThreadId(int threadId,
+                          bool isSPMDExecutionMode);    // omp_thread_num
 INLINE int GetOmpTeamId();                              // omp_team_num
 
 // get OpenMP number of threads and team
-INLINE int
-GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
-                      bool isRuntimeUninitialized); // omp_num_threads
-INLINE int GetNumberOfOmpTeams();                   // omp_num_teams
+INLINE int GetNumberOfOmpThreads(int threadId,
+                                 bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams();                           // omp_num_teams
 
 // get OpenMP number of procs
 INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
@@ -66,6 +65,10 @@ INLINE int GetNumberOfProcsInDevice(bool
 // masters
 INLINE int IsTeamMaster(int ompThreadId);
 
+// Parallel level
+INLINE void IncParallelLevel(bool ActiveParallel);
+INLINE void DecParallelLevel(bool ActiveParallel);
+
 ////////////////////////////////////////////////////////////////////////////////
 // Memory
 ////////////////////////////////////////////////////////////////////////////////

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h Thu May  2 13:05:01 2019
@@ -149,40 +149,29 @@ INLINE int GetLogicalThreadIdInBlock(boo
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
-                          bool isRuntimeUninitialized) {
+INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
   // omp_thread_num
   int rc;
-
-  if (isRuntimeUninitialized) {
-    ASSERT0(LT_FUSSY, isSPMDExecutionMode,
-            "Uninitialized runtime with non-SPMD mode.");
-    // For level 2 parallelism all parallel regions are executed sequentially.
-    if (parallelLevel[GetWarpId()] > 0)
-      rc = 0;
-    else
-      rc = GetThreadIdInBlock();
+  if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
+    rc = 0;
+  } else if (isSPMDExecutionMode) {
+    rc = GetThreadIdInBlock();
   } else {
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
     rc = currTaskDescr->ThreadId();
   }
   return rc;
 }
 
-INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
-                                 bool isRuntimeUninitialized) {
+INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) {
   // omp_num_threads
   int rc;
-
-  if (isRuntimeUninitialized) {
-    ASSERT0(LT_FUSSY, isSPMDExecutionMode,
-            "Uninitialized runtime with non-SPMD mode.");
-    // For level 2 parallelism all parallel regions are executed sequentially.
-    if (parallelLevel[GetWarpId()] > 0)
-      rc = 1;
-    else
-      rc = GetNumberOfThreadsInBlock();
+  if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
+    rc = 1;
+  } else if (isSPMDExecutionMode) {
+    rc = GetNumberOfThreadsInBlock();
   } else {
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
@@ -212,6 +201,31 @@ INLINE int GetNumberOfOmpTeams() {
 INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Parallel level
+
+INLINE void IncParallelLevel(bool ActiveParallel) {
+  unsigned tnum = __ACTIVEMASK();
+  int leader = __ffs(tnum) - 1;
+  __SHFL_SYNC(tnum, leader, leader);
+  if (GetLaneId() == leader) {
+    parallelLevel[GetWarpId()] +=
+        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+  }
+  __SHFL_SYNC(tnum, leader, leader);
+}
+
+INLINE void DecParallelLevel(bool ActiveParallel) {
+  unsigned tnum = __ACTIVEMASK();
+  int leader = __ffs(tnum) - 1;
+  __SHFL_SYNC(tnum, leader, leader);
+  if (GetLaneId() == leader) {
+    parallelLevel[GetWarpId()] -=
+        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+  }
+  __SHFL_SYNC(tnum, leader, leader);
+}
+
+////////////////////////////////////////////////////////////////////////////////
 // get OpenMP number of procs
 
 // Get the number of processors in the device.

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu Thu May  2 13:05:01 2019
@@ -48,8 +48,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *lo
     tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-    int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
-        tid, checkSPMDMode(loc_ref), /*isRuntimeUninitialized=*/false);
+    int numberOfActiveOMPThreads =
+        GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref));
     if (numberOfActiveOMPThreads > 1) {
       if (checkSPMDMode(loc_ref)) {
         __kmpc_barrier_simple_spmd(loc_ref, tid);

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c Thu May  2 13:05:01 2019
@@ -5,6 +5,7 @@
 
 const int MaxThreads = 1024;
 const int NumThreads = 64;
+const int NumThreads1 = 1;
 
 int main(int argc, char *argv[]) {
   int inParallel = -1, numThreads = -1, threadNum = -1;
@@ -14,20 +15,20 @@ int main(int argc, char *argv[]) {
     check1[i] = check2[i] = 0;
   }
 
-  #pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
+#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
   {
     inParallel = omp_in_parallel();
     numThreads = omp_get_num_threads();
     threadNum = omp_get_thread_num();
 
-    // Expecting active parallel region.
-    #pragma omp parallel num_threads(NumThreads)
+// Expecting active parallel region.
+#pragma omp parallel num_threads(NumThreads)
     {
       int id = omp_get_thread_num();
       check1[id] += omp_get_num_threads() + omp_in_parallel();
 
-      // Expecting serialized parallel region.
-      #pragma omp parallel
+// Expecting serialized parallel region.
+#pragma omp parallel
       {
         // Expected to be 1.
         int nestedInParallel = omp_in_parallel();
@@ -35,7 +36,7 @@ int main(int argc, char *argv[]) {
         int nestedNumThreads = omp_get_num_threads();
         // Expected to be 0.
         int nestedThreadNum = omp_get_thread_num();
-        #pragma omp atomic
+#pragma omp atomic
         check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
       }
     }
@@ -52,7 +53,8 @@ int main(int argc, char *argv[]) {
     int Expected = NumThreads + 1;
     if (i < NumThreads) {
       if (check1[i] != Expected) {
-        printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, check1[i]);
+        printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
+               check1[i]);
       }
     } else if (check1[i] != 0) {
       printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
@@ -65,6 +67,68 @@ int main(int argc, char *argv[]) {
       }
     } else if (check2[i] != 0) {
       printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+  }
+
+  inParallel = -1;
+  numThreads = -1;
+  threadNum = -1;
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = 0;
+  }
+
+#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
+  {
+    inParallel = omp_in_parallel();
+    numThreads = omp_get_num_threads();
+    threadNum = omp_get_thread_num();
+
+// Expecting active parallel region.
+#pragma omp parallel num_threads(NumThreads1)
+    {
+      int id = omp_get_thread_num();
+      check1[id] += omp_get_num_threads() + omp_in_parallel();
+
+// Expecting serialized parallel region.
+#pragma omp parallel
+      {
+        // Expected to be 0.
+        int nestedInParallel = omp_in_parallel();
+        // Expected to be 1.
+        int nestedNumThreads = omp_get_num_threads();
+        // Expected to be 0.
+        int nestedThreadNum = omp_get_thread_num();
+#pragma omp atomic
+        check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
+      }
+    }
+  }
+
+  // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
+  printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
+         inParallel, numThreads, threadNum);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    // Check that all threads reported
+    // omp_get_num_threads() = 1, omp_in_parallel() = 0.
+    int Expected = 1;
+    if (i < NumThreads1) {
+      if (check1[i] != Expected) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
+               check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    // Check serialized parallel region.
+    if (i < NumThreads1) {
+      if (check2[i] != 1) {
+        printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
     }
   }
 




More information about the Openmp-commits mailing list