[Openmp-commits] [openmp] r359341 - [OPENMP][NVPTX]Correctly handle L2 parallelism in SPMD mode.
Alexey Bataev via Openmp-commits
openmp-commits at lists.llvm.org
Fri Apr 26 12:30:34 PDT 2019
Author: abataev
Date: Fri Apr 26 12:30:34 2019
New Revision: 359341
URL: http://llvm.org/viewvc/llvm-project?rev=359341&view=rev
Log:
[OPENMP][NVPTX]Correctly handle L2 parallelism in SPMD mode.
Summary:
The parallelLevel counter must be on per-thread basis to fully support
L2+ parallelism, otherwise we may end up with undefined behavior.
Introduce the parallelLevel on per-warp basis using shared memory. It
allows to avoid the problems with the synchronization and allows fully
support L2+ parallelism in SPMD mode with no runtime.
Reviewers: gtbercea, grokos
Subscribers: guansong, jdoerfert, caomhin, kkwli0, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D60918
Modified:
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu Fri Apr 26 12:30:34 2019
@@ -165,7 +165,7 @@ EXTERN int omp_get_level(void) {
ASSERT0(LT_FUSSY, isSPMDMode(),
"Expected SPMD mode only with uninitialized runtime.");
// parallelLevel starts from 0, need to add 1 for correct level.
- return parallelLevel + 1;
+ return parallelLevel[GetWarpId()] + 1;
}
int level = 0;
omptarget_nvptx_TaskDescr *currTaskDescr =
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu Fri Apr 26 12:30:34 2019
@@ -31,7 +31,7 @@ __device__ omptarget_nvptx_SimpleMemoryM
__device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;
-__device__ __shared__ uint8_t parallelLevel;
+__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
// Pointer to this team's OpenMP state object
__device__ __shared__
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Fri Apr 26 12:30:34 2019
@@ -95,8 +95,10 @@ EXTERN void __kmpc_spmd_kernel_init(int
// If OMP runtime is not required don't initialize OMP state.
setExecutionParameters(Spmd, RuntimeUninitialized);
if (GetThreadIdInBlock() == 0) {
- parallelLevel = 0;
usedSlotIdx = smid() % MAX_SM;
+ parallelLevel[0] = 0;
+ } else if (GetLaneId() == 0) {
+ parallelLevel[GetWarpId()] = 0;
}
__SYNCTHREADS();
return;
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Fri Apr 26 12:30:34 2019
@@ -406,7 +406,8 @@ extern __device__ omptarget_nvptx_Simple
omptarget_nvptx_simpleMemoryManager;
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
-extern __device__ __shared__ uint8_t parallelLevel;
+extern __device__ __shared__ uint8_t
+ parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Fri Apr 26 12:30:34 2019
@@ -339,10 +339,12 @@ EXTERN void __kmpc_serialized_parallel(k
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- __SYNCTHREADS();
- if (GetThreadIdInBlock() == 0)
- ++parallelLevel;
- __SYNCTHREADS();
+ unsigned tnum = __ACTIVEMASK();
+ int leader = __ffs(tnum) - 1;
+ __SHFL_SYNC(tnum, leader, leader);
+ if (GetLaneId() == leader)
+ ++parallelLevel[GetWarpId()];
+ __SHFL_SYNC(tnum, leader, leader);
return;
}
@@ -382,10 +384,12 @@ EXTERN void __kmpc_end_serialized_parall
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- __SYNCTHREADS();
- if (GetThreadIdInBlock() == 0)
- --parallelLevel;
- __SYNCTHREADS();
+ unsigned tnum = __ACTIVEMASK();
+ int leader = __ffs(tnum) - 1;
+ __SHFL_SYNC(tnum, leader, leader);
+ if (GetLaneId() == leader)
+ --parallelLevel[GetWarpId()];
+ __SHFL_SYNC(tnum, leader, leader);
return;
}
@@ -407,7 +411,7 @@ EXTERN uint16_t __kmpc_parallel_level(km
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
"Expected SPMD mode with uninitialized runtime.");
- return parallelLevel + 1;
+ return parallelLevel[GetWarpId()] + 1;
}
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h Fri Apr 26 12:30:34 2019
@@ -40,6 +40,8 @@ INLINE int GetThreadIdInBlock();
INLINE int GetBlockIdInKernel();
INLINE int GetNumberOfBlocksInKernel();
INLINE int GetNumberOfThreadsInBlock();
+INLINE unsigned GetWarpId();
+INLINE unsigned GetLaneId();
// get global ids to locate tread/team info (constant regardless of OMP)
INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h Fri Apr 26 12:30:34 2019
@@ -102,6 +102,10 @@ INLINE int GetNumberOfBlocksInKernel() {
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
+
+INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
+
////////////////////////////////////////////////////////////////////////////////
//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
@@ -154,7 +158,7 @@ INLINE int GetOmpThreadId(int threadId,
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
- if (parallelLevel > 0)
+ if (parallelLevel[GetWarpId()] > 0)
rc = 0;
else
rc = GetThreadIdInBlock();
@@ -175,7 +179,7 @@ INLINE int GetNumberOfOmpThreads(int thr
ASSERT0(LT_FUSSY, isSPMDExecutionMode,
"Uninitialized runtime with non-SPMD mode.");
// For level 2 parallelism all parallel regions are executed sequentially.
- if (parallelLevel > 0)
+ if (parallelLevel[GetWarpId()] > 0)
rc = 1;
else
rc = GetNumberOfThreadsInBlock();
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp?rev=359341&r1=359340&r2=359341&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp Fri Apr 26 12:30:34 2019
@@ -6,24 +6,31 @@
int main(void) {
int isHost = -1;
int ParallelLevel1 = -1, ParallelLevel2 = -1;
+ int Count = 0;
#pragma omp target parallel for map(tofrom \
- : isHost, ParallelLevel1, ParallelLevel2)
+ : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
for (int J = 0; J < 10; ++J) {
#pragma omp critical
{
- isHost = (isHost < 0 || isHost == omp_is_initial_device())
- ? omp_is_initial_device()
- : 1;
- ParallelLevel1 =
- (ParallelLevel1 < 0 || ParallelLevel1 == 1) ? omp_get_level() : 2;
+ isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
+ ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
+ ? omp_get_level()
+ : ParallelLevel1;
}
- int L2;
-#pragma omp parallel for schedule(dynamic) lastprivate(L2)
- for (int I = 0; I < 10; ++I)
- L2 = omp_get_level();
+ if (omp_get_thread_num() > 5) {
+ int L2;
+#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
+ for (int I = 0; I < 10; ++I) {
+ L2 = omp_get_level();
+ Count += omp_get_level(); // (10-6)*10*2 = 80
+ }
#pragma omp critical
- ParallelLevel2 = (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : 1;
+ ParallelLevel2 =
+ (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
+ } else {
+ Count += omp_get_level(); // 6 * 1 = 6
+ }
}
if (isHost < 0) {
@@ -35,6 +42,10 @@ int main(void) {
// CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
ParallelLevel2);
+ // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
+ // level) + 6(num of iterations) * 1(par level)
+ // CHECK: Expected count = 86
+ printf("Expected count = %d\n", Count);
return isHost;
}
More information about the Openmp-commits
mailing list