[Openmp-commits] [openmp] r335469 - [OPENMP, NVPTX] Fixes for NVPTX RTL

Mon Jun 25 06:43:35 PDT 2018

Author: abataev
Date: Mon Jun 25 06:43:35 2018
New Revision: 335469

URL: http://llvm.org/viewvc/llvm-project?rev=335469&view=rev
Log:
[OPENMP, NVPTX] Fixes for NVPTX RTL

Summary:
This patch fixes several problems in the implementation of the NVPTX RTL.
1. Fixes detection of the last iteration for loops with static scheduling, no chunks.
2. Fixes reductions for the serialized parallel constructs.
3. Fixes handling of the barriers.

Reviewers: grokos

Reviewed By: grokos

Subscribers: Hahnfeld, guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D48480

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=335469&r1=335468&r2=335469&view=diff
==============================================================================

--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Mon Jun 25 06:43:35 2018
@@ -86,7 +86,7 @@ public:
 
     T inputUb = ub;
     ub = lb + chunk - 1; // Clang uses i <= ub
-    last = ub == inputUb;
+    last = lb <= inputUb && inputUb <= ub;
     stride = loopSize; // make sure we only do 1 chunk per warp
   }
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu?rev=335469&r1=335468&r2=335469&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu Mon Jun 25 06:43:35 2018
@@ -161,6 +161,11 @@ int32_t nvptx_parallel_reduce_nowait(int
                                      kmp_InterWarpCopyFctPtr cpyFct,
                                      bool isSPMDExecutionMode,
                                      bool isRuntimeUninitialized = false) {
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+  uint32_t NumThreads = GetNumberOfOmpThreads(
+      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+  if (NumThreads == 1)
+    return 1;
   /*
    * This reduce function handles reduction within a team. It handles
    * parallel regions in both L1 and L2 parallelism levels. It also
@@ -173,9 +178,6 @@ int32_t nvptx_parallel_reduce_nowait(int
    */
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
   uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
   uint32_t WarpId = BlockThreadId / WARPSIZE;
 
@@ -219,10 +221,6 @@ int32_t nvptx_parallel_reduce_nowait(int
                                     // early.
     return gpu_irregular_simd_reduce(reduce_data, shflFct);
 
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
-
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
   //

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu?rev=335469&r1=335468&r2=335469&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu Mon Jun 25 06:43:35 2018
@@ -35,40 +35,46 @@ EXTERN void __kmpc_end_ordered(kmp_Inden
 
 EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
-  __syncthreads();
+  __kmpc_barrier(loc_ref, tid);
   PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
   return 0;
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  tid = GetLogicalThreadIdInBlock();
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-  if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-    int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+  if (isSPMDMode()) {
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
+  } else if (isRuntimeUninitialized()) {
+    __kmpc_barrier_simple_generic(loc_ref, tid);
+  } else {
+    tid = GetLogicalThreadIdInBlock();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
+    if (!currTaskDescr->InL2OrHigherParallelRegion()) {
+      int numberOfActiveOMPThreads =
+          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    // On Volta and newer architectures we require that all lanes in
-    // a warp (at least, all present for the kernel launch) participate in the
-    // barrier.  This is enforced when launching the parallel region.  An
-    // exception is when there are < WARPSIZE workers.  In this case only 1
-    // worker is started, so we don't need a barrier.
-    if (numberOfActiveOMPThreads > 1) {
+      // On Volta and newer architectures we require that all lanes in
+      // a warp (at least, all present for the kernel launch) participate in the
+      // barrier.  This is enforced when launching the parallel region.  An
+      // exception is when there are < WARPSIZE workers.  In this case only 1
+      // worker is started, so we don't need a barrier.
+      if (numberOfActiveOMPThreads > 1) {
 #endif
-      // The #threads parameter must be rounded up to the WARPSIZE.
-      int threads =
-          WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+        // The #threads parameter must be rounded up to the WARPSIZE.
+        int threads =
+            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
 
-      PRINT(LD_SYNC,
-            "call kmpc_barrier with %d omp threads, sync parameter %d\n",
-            numberOfActiveOMPThreads, threads);
-      // Barrier #1 is for synchronization among active threads.
-      named_sync(L1_BARRIER, threads);
+        PRINT(LD_SYNC,
+              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+              numberOfActiveOMPThreads, threads);
+        // Barrier #1 is for synchronization among active threads.
+        named_sync(L1_BARRIER, threads);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    } // numberOfActiveOMPThreads > 1
+      } // numberOfActiveOMPThreads > 1
 #endif
+    }
+    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
-  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
 }
 
 // Emit a simple barrier call in SPMD mode.  Assumes the caller is in an L0