[Openmp-commits] [openmp] r360584 - [OPENMP][NVPTX]Simplify handling of thread limit, NFC.
Alexey Bataev via Openmp-commits
openmp-commits at lists.llvm.org
Mon May 13 07:21:46 PDT 2019
Author: abataev
Date: Mon May 13 07:21:46 2019
New Revision: 360584
URL: http://llvm.org/viewvc/llvm-project?rev=360584&view=rev
Log:
[OPENMP][NVPTX]Simplify handling of thread limit, NFC.
Summary:
Patch improves performance of the full runtime mode by moving
threads limit counter to the shared memory. It also allows to save
global memory.
Reviewers: grokos, kkwli0, gtbercea
Subscribers: guansong, jdoerfert, openmp-commits, caomhin
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D61801
Modified:
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu Mon May 13 07:21:46 2019
@@ -37,10 +37,8 @@ EXTERN void omp_set_num_threads(int num)
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
- } else {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
- currTaskDescr->NThreads() = num;
+ } else if (parallelLevel[GetWarpId()] == 0) {
+ nThreads = num;
}
}
@@ -54,12 +52,10 @@ EXTERN int omp_get_max_threads(void) {
if (parallelLevel[GetWarpId()] > 0)
// We're already in parallel region.
return 1; // default is 1 thread avail
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
- ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
- "Should no be in the parallel region");
// Not currently in a parallel region, return what was set.
- int rc = currTaskDescr->NThreads();
+ int rc = 1;
+ if (parallelLevel[GetWarpId()] == 0)
+ rc = nThreads;
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
return rc;
@@ -175,7 +171,7 @@ EXTERN int omp_get_ancestor_thread_num(i
(int)currTaskDescr->InParallelRegion(), (int)sched,
currTaskDescr->RuntimeChunkSize(),
(int)currTaskDescr->ThreadId(), (int)threadsInTeam,
- (int)currTaskDescr->NThreads());
+ (int)nThreads);
}
if (currTaskDescr->IsParallelConstruct()) {
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu Mon May 13 07:21:46 2019
@@ -34,6 +34,7 @@ __device__ __shared__ uint32_t usedSlotI
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
__device__ __shared__ uint16_t threadLimit;
__device__ __shared__ uint16_t threadsInTeam;
+__device__ __shared__ uint16_t nThreads;
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Mon May 13 07:21:46 2019
@@ -63,7 +63,7 @@ EXTERN void __kmpc_kernel_init(int Threa
// init team context
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false);
+ currTeamDescr.InitTeamDescr();
// this thread will start execution... has to update its task ICV
// to point to the level zero task ICV. That ICV was init in
// InitTeamDescr()
@@ -73,7 +73,7 @@ EXTERN void __kmpc_kernel_init(int Threa
// set number of threads and thread limit in team to started value
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
+ nThreads = GetNumberOfWorkersInTeam();
threadLimit = ThreadLimit;
}
@@ -123,7 +123,7 @@ EXTERN void __kmpc_spmd_kernel_init(int
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// init team context
- currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true);
+ currTeamDescr.InitTeamDescr();
}
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Mon May 13 07:21:46 2019
@@ -164,7 +164,6 @@ public:
}
INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
- INLINE uint16_t &NThreads() { return items.nthreads; }
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
@@ -172,7 +171,7 @@ public:
prev = taskDescr;
}
// init & copy
- INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
+ INLINE void InitLevelZeroTaskDescr();
INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
@@ -208,7 +207,6 @@ private:
struct TaskDescr_items {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
- uint16_t nthreads; // thread num for subsequent parallel regions
uint16_t threadId; // thread id
uint64_t runtimeChunkSize; // runtime chunk size
} items;
@@ -249,7 +247,7 @@ public:
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
// init
- INLINE void InitTeamDescr(bool isSPMDExecutionMode);
+ INLINE void InitTeamDescr();
INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
// If this is invoked by the master thread of the master warp then intialize
@@ -404,6 +402,7 @@ extern __device__ __shared__ uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__ uint16_t threadsInTeam;
+extern __device__ __shared__ uint16_t nThreads;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h Mon May 13 07:21:46 2019
@@ -31,7 +31,7 @@ INLINE void omptarget_nvptx_TaskDescr::S
}
INLINE void
-omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
// slow method
// flag:
// default sched is static,
@@ -39,8 +39,6 @@ omptarget_nvptx_TaskDescr::InitLevelZero
// not in parallel
items.flags = 0;
- items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
- ; // threads: whatever was alloc by kernel
items.threadId = 0; // is master
items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
}
@@ -57,7 +55,6 @@ INLINE void omptarget_nvptx_TaskDescr::I
items.flags =
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
- items.nthreads = 0; // # threads for subsequent parallel region
items.threadId =
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
@@ -173,8 +170,8 @@ omptarget_nvptx_ThreadPrivateContext::In
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
-INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
- levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
+ levelZeroTaskDescr.InitLevelZeroTaskDescr();
}
////////////////////////////////////////////////////////////////////////////////
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=360584&r1=360583&r2=360584&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Mon May 13 07:21:46 2019
@@ -249,8 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parall
uint16_t &NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
- uint16_t NumThreads = determineNumberOfThreads(
- NumThreadsClause, currTaskDescr->NThreads(), threadLimit);
+ uint16_t NumThreads =
+ determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
if (NumThreadsClause != 0) {
// Reset request to avoid propagating to successive #parallel
@@ -308,7 +308,7 @@ EXTERN bool __kmpc_kernel_parallel(void
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
- (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
+ (int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
IncParallelLevel(threadsInTeam != 1);
More information about the Openmp-commits
mailing list