[Openmp-commits] [PATCH] D65013: [OPENMP][NVPTX]Fix parallel level counter in Cuda 9.0.

Alexey Bataev via Phabricator via Openmp-commits openmp-commits at lists.llvm.org
Fri Jul 19 13:18:14 PDT 2019


ABataev created this revision.
ABataev added a reviewer: grokos.
Herald added subscribers: jdoerfert, jfb, guansong.
Herald added a project: OpenMP.

In Cuda 9.0 it is not guaranteed that threads in the warps are
convergent. We need to use the __syncwarp() function to force
synchronization of the threads within the warps and to guarantee
memory ordering among threads in the warps.


Repository:
  rOMP OpenMP

https://reviews.llvm.org/D65013

Files:
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/supporti.h


Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -202,25 +202,31 @@
 // Parallel level
 
 INLINE void IncParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  __SYNCWARP();
+  unsigned Active = __ACTIVEMASK();
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] +=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP();
 }
 
 INLINE void DecParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  __SYNCWARP();
+  unsigned Active = __ACTIVEMASK();
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] -=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -55,11 +55,22 @@
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down_sync((mask), (var), (delta), (width))
 #define __ACTIVEMASK() __activemask()
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#define __SYNCWARP() __syncwarp()
+#else
+// For .target sm_6x or below, all threads in mask must execute the same
+// __syncwarp() in convergence, and the union of all values in mask must be
+// equal to the active mask. Otherwise, the behavior is undefined.
+// (https://docs.nvidia.com/cuda/archive/9.0/cuda-c-programming-guide/index.html#synchronization-functions)
+#define __SYNCWARP() __syncwarp(__activemask())
+#endif // __CUDA_ARCH__
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down((var), (delta), (width))
 #define __ACTIVEMASK() __ballot(1)
+// In Cuda < 9.0 no need to sync threads in warps.
+#define __SYNCWARP()
 #endif // CUDA_VERSION
 
 #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D65013.210884.patch
Type: text/x-patch
Size: 2845 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20190719/de8b8485/attachment.bin>


More information about the Openmp-commits mailing list