[Openmp-commits] [openmp] r337691 - [OPNEMP, NVPTX] Fixed sychronization construct + code cleanup.
Alexey Bataev via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jul 23 06:52:12 PDT 2018
Author: abataev
Date: Mon Jul 23 06:52:12 2018
New Revision: 337691
URL: http://llvm.org/viewvc/llvm-project?rev=337691&view=rev
Log:
[OPNEMP, NVPTX] Fixed sychronization construct + code cleanup.
Summary:
1. Fixed internal problem in `__kmpc_barrier` function: SPMD mode
synchronization function should be called only in L1 parallel level.
2. Removed some extra code for synchronization inside of the code, used
`__kmpc_barrier` instead.
3. Some code cleanup.
Reviewers: gtbercea, grokos
Subscribers: openmp-commits
Differential Revision: https://reviews.llvm.org/D49564
Modified:
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=337691&r1=337690&r2=337691&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Mon Jul 23 06:52:12 2018
@@ -240,12 +240,8 @@ public:
// Process schedule.
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
- if (OrderedSchedule(schedule)) {
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
- }
+ if (OrderedSchedule(schedule))
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
(long)tnum, P64(tripCount), schedule);
@@ -338,10 +334,7 @@ public:
omptarget_nvptx_threadPrivateContext->Stride(tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
+ __kmpc_barrier(loc, threadId);
// save sched state
int teamId = GetOmpTeamId();
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
@@ -352,10 +345,7 @@ public:
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
}
- if (isSPMDMode())
- __syncthreads();
- else
- __kmpc_barrier(loc, threadId);
+ __kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
"dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
", chunk %" PRIu64 "\n",
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu?rev=337691&r1=337690&r2=337691&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu Mon Jul 23 06:52:12 2018
@@ -25,9 +25,8 @@ int32_t __gpu_block_reduce() {
if (nt != blockDim.x)
return 0;
unsigned tnum = __ACTIVEMASK();
- if (tnum != (~0x0)) { // assume swapSize is 32
+ if (tnum != (~0x0)) // assume swapSize is 32
return 0;
- }
return 1;
}
@@ -48,32 +47,21 @@ int32_t __kmpc_reduce_gpu(kmp_Indent *lo
if (numthread == 1)
return 1;
- else if (!__gpu_block_reduce())
+ if (!__gpu_block_reduce())
return 2;
- else {
- if (threadIdx.x == 0)
- return 1;
- else
- return 0;
- }
+ if (threadIdx.x == 0)
+ return 1;
+ return 0;
}
EXTERN
int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
- if (threadIdx.x == 0) {
- return 2;
- } else {
- return 0;
- }
+ return threadIdx.x == 0 ? 2 : 0;
}
EXTERN
int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
- if (threadIdx.x % 32 == 0) {
- return 1;
- } else {
- return 0;
- }
+ return (threadIdx.x % 32 == 0) ? 1 : 0;
}
EXTERN
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h?rev=337691&r1=337690&r2=337691&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h Mon Jul 23 06:52:12 2018
@@ -155,8 +155,7 @@ INLINE int IsTeamMaster(int ompThreadId)
INLINE int GetNumberOfProcsInDevice() {
if (isGenericMode())
return GetNumberOfWorkersInTeam();
- else
- return GetNumberOfThreadsInBlock();
+ return GetNumberOfThreadsInBlock();
}
INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu?rev=337691&r1=337690&r2=337691&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu Mon Jul 23 06:52:12 2018
@@ -41,25 +41,21 @@ EXTERN int32_t __kmpc_cancel_barrier(kmp
}
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
- if (isSPMDMode()) {
- __kmpc_barrier_simple_spmd(loc_ref, tid);
- } else if (isRuntimeUninitialized()) {
- __kmpc_barrier_simple_generic(loc_ref, tid);
+ if (isRuntimeUninitialized()) {
+ if (isSPMDMode())
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ else
+ __kmpc_barrier_simple_generic(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
- if (!currTaskDescr->InL2OrHigherParallelRegion()) {
- int numberOfActiveOMPThreads =
- GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp (at least, all present for the kernel launch) participate in the
- // barrier. This is enforced when launching the parallel region. An
- // exception is when there are < WARPSIZE workers. In this case only 1
- // worker is started, so we don't need a barrier.
- if (numberOfActiveOMPThreads > 1) {
-#endif
+ int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+ tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
+ if (numberOfActiveOMPThreads > 1) {
+ if (isSPMDMode()) {
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ } else {
// The #threads parameter must be rounded up to the WARPSIZE.
int threads =
WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
@@ -69,10 +65,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *l
numberOfActiveOMPThreads, threads);
// Barrier #1 is for synchronization among active threads.
named_sync(L1_BARRIER, threads);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- } // numberOfActiveOMPThreads > 1
-#endif
- }
+ }
+ } // numberOfActiveOMPThreads > 1
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
}
More information about the Openmp-commits
mailing list