[Openmp-commits] [openmp] r360584 - [OPENMP][NVPTX]Simplify handling of thread limit, NFC.

Alexey Bataev via Openmp-commits openmp-commits at lists.llvm.org
Mon May 13 07:21:46 PDT 2019


Author: abataev
Date: Mon May 13 07:21:46 2019
New Revision: 360584

URL: http://llvm.org/viewvc/llvm-project?rev=360584&view=rev
Log:
[OPENMP][NVPTX]Simplify handling of thread limit, NFC.

Summary:
Patch improves performance of the full runtime mode by moving
threads limit counter to the shared memory. It also allows to save
global memory.

Reviewers: grokos, kkwli0, gtbercea

Subscribers: guansong, jdoerfert, openmp-commits, caomhin

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D61801

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu Mon May 13 07:21:46 2019
@@ -37,10 +37,8 @@ EXTERN void omp_set_num_threads(int num)
   PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
   if (num <= 0) {
     WARNING0(LW_INPUT, "expected positive num; ignore\n");
-  } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr =
-        getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
-    currTaskDescr->NThreads() = num;
+  } else if (parallelLevel[GetWarpId()] == 0) {
+    nThreads = num;
   }
 }
 
@@ -54,12 +52,10 @@ EXTERN int omp_get_max_threads(void) {
   if (parallelLevel[GetWarpId()] > 0)
     // We're already in parallel region.
     return 1; // default is 1 thread avail
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
-  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
-          "Should no be in the parallel region");
   // Not currently in a parallel region, return what was set.
-  int rc = currTaskDescr->NThreads();
+  int rc = 1;
+  if (parallelLevel[GetWarpId()] == 0)
+    rc = nThreads;
   ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
   PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
   return rc;
@@ -175,7 +171,7 @@ EXTERN int omp_get_ancestor_thread_num(i
                 (int)currTaskDescr->InParallelRegion(), (int)sched,
                 currTaskDescr->RuntimeChunkSize(),
                 (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
-                (int)currTaskDescr->NThreads());
+                (int)nThreads);
         }
 
         if (currTaskDescr->IsParallelConstruct()) {

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu Mon May 13 07:21:46 2019
@@ -34,6 +34,7 @@ __device__ __shared__ uint32_t usedSlotI
 __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 __device__ __shared__ uint16_t threadLimit;
 __device__ __shared__ uint16_t threadsInTeam;
+__device__ __shared__ uint16_t nThreads;
 // Pointer to this team's OpenMP state object
 __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Mon May 13 07:21:46 2019
@@ -63,7 +63,7 @@ EXTERN void __kmpc_kernel_init(int Threa
 
   // init team context
   omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
-  currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false);
+  currTeamDescr.InitTeamDescr();
   // this thread will start execution... has to update its task ICV
   // to point to the level zero task ICV. That ICV was init in
   // InitTeamDescr()
@@ -73,7 +73,7 @@ EXTERN void __kmpc_kernel_init(int Threa
   // set number of threads and thread limit in team to started value
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-  currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
+  nThreads = GetNumberOfWorkersInTeam();
   threadLimit = ThreadLimit;
 }
 
@@ -123,7 +123,7 @@ EXTERN void __kmpc_spmd_kernel_init(int
     omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
     omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
     // init team context
-    currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true);
+    currTeamDescr.InitTeamDescr();
   }
   // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
   __SYNCTHREADS();

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Mon May 13 07:21:46 2019
@@ -164,7 +164,6 @@ public:
   }
   INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
   // methods for other fields
-  INLINE uint16_t &NThreads() { return items.nthreads; }
   INLINE uint16_t &ThreadId() { return items.threadId; }
   INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
   INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
@@ -172,7 +171,7 @@ public:
     prev = taskDescr;
   }
   // init & copy
-  INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
+  INLINE void InitLevelZeroTaskDescr();
   INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
   INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
   INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
@@ -208,7 +207,6 @@ private:
   struct TaskDescr_items {
     uint8_t flags; // 6 bit used (see flag above)
     uint8_t unused;
-    uint16_t nthreads;         // thread num for subsequent parallel regions
     uint16_t threadId;         // thread id
     uint64_t runtimeChunkSize; // runtime chunk size
   } items;
@@ -249,7 +247,7 @@ public:
   INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
 
   // init
-  INLINE void InitTeamDescr(bool isSPMDExecutionMode);
+  INLINE void InitTeamDescr();
 
   INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
     // If this is invoked by the master thread of the master warp then intialize
@@ -404,6 +402,7 @@ extern __device__ __shared__ uint8_t
     parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 extern __device__ __shared__ uint16_t threadLimit;
 extern __device__ __shared__ uint16_t threadsInTeam;
+extern __device__ __shared__ uint16_t nThreads;
 extern __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h Mon May 13 07:21:46 2019
@@ -31,7 +31,7 @@ INLINE void omptarget_nvptx_TaskDescr::S
 }
 
 INLINE void
-omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
   // slow method
   // flag:
   //   default sched is static,
@@ -39,8 +39,6 @@ omptarget_nvptx_TaskDescr::InitLevelZero
   //   not in parallel
 
   items.flags = 0;
-  items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
-  ;                                // threads: whatever was alloc by kernel
   items.threadId = 0;         // is master
   items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
 }
@@ -57,7 +55,6 @@ INLINE void omptarget_nvptx_TaskDescr::I
 
   items.flags =
       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
-  items.nthreads = 0; // # threads for subsequent parallel region
   items.threadId =
       GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
   items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
@@ -173,8 +170,8 @@ omptarget_nvptx_ThreadPrivateContext::In
 // Team Descriptor
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
-  levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
+  levelZeroTaskDescr.InitLevelZeroTaskDescr();
 }
 
 ////////////////////////////////////////////////////////////////////////////////

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Mon May 13 07:21:46 2019
@@ -249,8 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parall
   uint16_t &NumThreadsClause =
       omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
 
-  uint16_t NumThreads = determineNumberOfThreads(
-      NumThreadsClause, currTaskDescr->NThreads(), threadLimit);
+  uint16_t NumThreads =
+      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
 
   if (NumThreadsClause != 0) {
     // Reset request to avoid propagating to successive #parallel
@@ -308,7 +308,7 @@ EXTERN bool __kmpc_kernel_parallel(void
     PRINT(LD_PAR,
           "thread will execute parallel region with id %d in a team of "
           "%d threads\n",
-          (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
+          (int)newTaskDescr->ThreadId(), (int)nThreads);
 
     isActive = true;
     IncParallelLevel(threadsInTeam != 1);




More information about the Openmp-commits mailing list