[Openmp-commits] [openmp] r350431 - [OPENMP][NVPTX]General formatting/code improvement, NFC.

Fri Jan 4 12:16:54 PST 2019

Author: abataev
Date: Fri Jan  4 12:16:54 2019
New Revision: 350431

URL: http://llvm.org/viewvc/llvm-project?rev=350431&view=rev
Log:
[OPENMP][NVPTX]General formatting/code improvement, NFC.

Summary: Formatting.

Reviewers: gtbercea, grokos, kkwli0

Subscribers: guansong, openmp-commits, caomhin

Differential Revision: https://reviews.llvm.org/D56290

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================

--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Fri Jan  4 12:16:54 2019
@@ -13,42 +13,26 @@
 #include "omptarget-nvptx.h"
 #include <stdio.h>
 
-// Number of threads in the CUDA block.
-__device__ static unsigned getNumThreads() { return blockDim.x; }
-// Thread ID in the CUDA block
-__device__ static unsigned getThreadId() { return threadIdx.x; }
 // Warp ID in the CUDA block
-__device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
 // Lane ID in the CUDA warp.
-__device__ static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
-
-// The CUDA thread ID of the master thread.
-__device__ static unsigned getMasterThreadId() {
-  unsigned Mask = WARPSIZE - 1;
-  return (getNumThreads() - 1) & (~Mask);
-}
-
-// Find the active threads in the warp - return a mask whose n-th bit is set if
-// the n-th thread in the warp is active.
-__device__ static unsigned getActiveThreadsMask() {
-  return __BALLOT_SYNC(0xFFFFFFFF, true);
-}
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
 
 // Return true if this is the first active thread in the warp.
-__device__ static bool IsWarpMasterActiveThread() {
-  unsigned long long Mask = getActiveThreadsMask();
-  unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE);
+INLINE static bool IsWarpMasterActiveThread() {
+  unsigned long long Mask = __ACTIVEMASK();
+  unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
   unsigned long long Sh = Mask << ShNum;
   // Truncate Sh to the 32 lower bits
   return (unsigned)Sh == 0;
 }
 // Return true if this is the master thread.
-__device__ static bool IsMasterThread(bool isSPMDExecutionMode) {
-  return !isSPMDExecutionMode && getMasterThreadId() == getThreadId();
+INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
+  return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
 }
 
 /// Return the provided size aligned to the size of a pointer.
-__device__ static size_t AlignVal(size_t Val) {
+INLINE static size_t AlignVal(size_t Val) {
   const size_t Align = (size_t)sizeof(void *);
   if (Val & (Align - 1)) {
     Val += Align;
@@ -128,7 +112,7 @@ EXTERN void *__kmpc_data_sharing_environ
           (unsigned long long)SharingDefaultDataSize);
 
   unsigned WID = getWarpId();
-  unsigned CurActiveThreads = getActiveThreadsMask();
+  unsigned CurActiveThreads = __ACTIVEMASK();
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
@@ -268,7 +252,7 @@ EXTERN void __kmpc_data_sharing_environm
     return;
   }
 
-  int32_t CurActive = getActiveThreadsMask();
+  int32_t CurActive = __ACTIVEMASK();
 
   // Only the warp master can restore the stack and frame information, and only
   // if there are no other threads left behind in this environment (i.e. the
@@ -341,7 +325,7 @@ __kmpc_get_data_sharing_environment_fram
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE void data_sharing_init_stack_common() {
+INLINE static void data_sharing_init_stack_common() {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();
@@ -380,11 +364,11 @@ EXTERN void __kmpc_data_sharing_init_sta
   __threadfence_block();
 }
 
-INLINE void* data_sharing_push_stack_common(size_t PushSize) {
+INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
 
   // Only warp active master threads manage the stack.
-  bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
+  bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
 
   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
@@ -394,7 +378,7 @@ INLINE void* data_sharing_push_stack_com
   // Frame pointer must be visible to all workers in the same warp.
   const unsigned WID = getWarpId();
   void *FrameP = 0;
-  const int32_t CurActive = getActiveThreadsMask();
+  int32_t CurActive = __ACTIVEMASK();
 
   if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
@@ -454,8 +438,8 @@ INLINE void* data_sharing_push_stack_com
   return FrameP;
 }
 
-EXTERN void* __kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+                                                      int16_t UseSharedMemory) {
   return data_sharing_push_stack_common(DataSize);
 }
 
@@ -466,8 +450,8 @@ EXTERN void* __kmpc_data_sharing_coalesc
 // By default the globalized variables are stored in global memory. If the
 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
 // as long as the size requested fits the pre-allocated size.
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
-    int16_t UseSharedMemory) {
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+                                            int16_t UseSharedMemory) {
   // Compute the total memory footprint of the requested data.
   // The master thread requires a stack only for itself. A worker
   // thread (which at this point is a warp master) will require
@@ -495,7 +479,7 @@ EXTERN void __kmpc_data_sharing_pop_stac
 
   __threadfence_block();
 
-  if (getThreadId() % WARPSIZE == 0) {
+  if (GetThreadIdInBlock() % WARPSIZE == 0) {
     unsigned WID = getWarpId();
 
     // Current slot
@@ -572,7 +556,7 @@ EXTERN void __kmpc_get_team_static_memor
     __SYNCTHREADS();
     return;
   }
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __threadfence();
@@ -591,7 +575,7 @@ EXTERN void __kmpc_restore_team_static_m
     return;
   }
   __threadfence();
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   omptarget_nvptx_simpleMemoryManager.Release();
 }

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/debug.h Fri Jan  4 12:16:54 2019
@@ -130,7 +130,7 @@
 #include "option.h"
 
 template <typename... Arguments>
-static NOINLINE void log(const char *fmt, Arguments... parameters) {
+NOINLINE static void log(const char *fmt, Arguments... parameters) {
   printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
          (int)(threadIdx.x & 0x1F), parameters...);
 }

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Fri Jan  4 12:16:54 2019
@@ -414,12 +414,8 @@ public:
     return FINISHED;
   }
 
-  // On Pascal, with inlining of the runtime into the user application,
-  // this code deadlocks.  This is probably because different threads
-  // in a warp cannot make independent progress.
-  NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid,
-                                    int32_t *plast, T *plower, T *pupper,
-                                    ST *pstride) {
+  INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
+                                  T *plower, T *pupper, ST *pstride) {
     ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
             "Expected non-SPMD mode + initialized runtime.");
     // ID of a thread in its own warp

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Fri Jan  4 12:16:54 2019
@@ -29,7 +29,7 @@ extern __device__ omptarget_nvptx_Queue<
 // init entry points
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE unsigned smid() {
+INLINE static unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
   return id;

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Fri Jan  4 12:16:54 2019
@@ -53,13 +53,11 @@
 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down_sync((mask), (var), (delta), (width))
-#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
 #define __ACTIVEMASK() __activemask()
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down((var), (delta), (width))
-#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
 #define __ACTIVEMASK() __ballot(1)
 #endif
 
@@ -93,7 +91,7 @@ public:
     }
   }
   // Called by all threads.
-  INLINE void **GetArgs() { return args; };
+  INLINE void **GetArgs() const { return args; };
 private:
   // buffer of pre-allocated arguments.
   void *buffer[MAX_SHARED_ARGS];
@@ -104,7 +102,8 @@ private:
   uint32_t nArgs;
 };
 
-extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+extern __device__ __shared__ omptarget_nvptx_SharedArgs
+    omptarget_nvptx_globalArgs;
 
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
@@ -155,23 +154,23 @@ extern __device__ __shared__ DataSharing
 class omptarget_nvptx_TaskDescr {
 public:
   // methods for flags
-  INLINE omp_sched_t GetRuntimeSched();
+  INLINE omp_sched_t GetRuntimeSched() const;
   INLINE void SetRuntimeSched(omp_sched_t sched);
-  INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
-  INLINE int InL2OrHigherParallelRegion() {
+  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
+  INLINE int InL2OrHigherParallelRegion() const {
     return items.flags & TaskDescr_InParL2P;
   }
-  INLINE int IsParallelConstruct() {
+  INLINE int IsParallelConstruct() const {
     return items.flags & TaskDescr_IsParConstr;
   }
-  INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
+  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
   // methods for other fields
   INLINE uint16_t &NThreads() { return items.nthreads; }
   INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
   INLINE uint16_t &ThreadId() { return items.threadId; }
   INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
   INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
-  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
+  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
   INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
     prev = taskDescr;
   }
@@ -326,7 +325,7 @@ public:
                                    omptarget_nvptx_TaskDescr *taskICV) {
     topTaskDescr[tid] = taskICV;
   }
-  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
   // parallel
   INLINE uint16_t &NumThreadsForNextParallel(int tid) {
     return nextRegion.tnum[tid];
@@ -381,7 +380,7 @@ private:
     volatile unsigned keys[OMP_STATE_COUNT];
   } MemData[MAX_SM];
 
-  INLINE uint32_t hash(unsigned key) const {
+  INLINE static uint32_t hash(unsigned key) {
     return key & (OMP_STATE_COUNT - 1);
   }
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h Fri Jan  4 12:16:54 2019
@@ -16,7 +16,7 @@
 // Task Descriptor
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
   // sched starts from 1..4; encode it as 0..3; so add 1 here
   uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
   return (omp_sched_t)rc;
@@ -155,7 +155,7 @@ INLINE void omptarget_nvptx_TaskDescr::R
 ////////////////////////////////////////////////////////////////////////////////
 
 INLINE omptarget_nvptx_TaskDescr *
-omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) {
+omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
   ASSERT0(
       LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
       "Getting top level, tid is larger than allocated data structure size");

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Fri Jan  4 12:16:54 2019
@@ -193,7 +193,7 @@ EXTERN void __kmpc_kernel_end_convergent
 // support for parallel that goes parallel (1 static level only)
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                 uint16_t NThreadsICV,
                                                 uint16_t ThreadLimit) {
   uint16_t ThreadsRequested = NThreadsICV;
@@ -236,7 +236,7 @@ EXTERN void __kmpc_kernel_prepare_parall
   // This routine is only called by the team master.  The team master is
   // the first thread of the last warp.  It always has the logical thread
   // id of 0 (since it is a shadow for the first worker thread).
-  int threadId = 0;
+  const int threadId = 0;
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
   ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu Fri Jan  4 12:16:54 2019
@@ -86,7 +86,7 @@ EXTERN int64_t __kmpc_shuffle_int64(int6
    return val;
 }
 
-static INLINE void gpu_regular_warp_reduce(void *reduce_data,
+INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                            kmp_ShuffleReductFctPtr shflFct) {
   for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
@@ -94,7 +94,7 @@ static INLINE void gpu_regular_warp_redu
   }
 }
 
-static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
+INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                              kmp_ShuffleReductFctPtr shflFct,
                                              uint32_t size, uint32_t tid) {
   uint32_t curr_size;
@@ -108,18 +108,18 @@ static INLINE void gpu_irregular_warp_re
   }
 }
 
-static INLINE uint32_t
+INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
   uint32_t lanemask_lt;
   uint32_t lanemask_gt;
   uint32_t size, remote_id, physical_lane_id;
   physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
   asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
   do {
-    Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+    Liveness = __ACTIVEMASK();
     remote_id = __ffs(Liveness & lanemask_gt);
     size = __popc(Liveness);
     logical_lane_id /= 2;
@@ -134,7 +134,7 @@ int32_t __kmpc_nvptx_simd_reduce_nowait(
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct) {
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   if (Liveness == 0xffffffff) {
     gpu_regular_warp_reduce(reduce_data, shflFct);
     return GetThreadIdInBlock() % WARPSIZE ==
@@ -146,12 +146,10 @@ int32_t __kmpc_nvptx_simd_reduce_nowait(
 }
 
 INLINE
-int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
-                                     size_t reduce_size, void *reduce_data,
-                                     kmp_ShuffleReductFctPtr shflFct,
-                                     kmp_InterWarpCopyFctPtr cpyFct,
-                                     bool isSPMDExecutionMode,
-                                     bool isRuntimeUninitialized) {
+static int32_t nvptx_parallel_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
   uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
   uint32_t NumThreads = GetNumberOfOmpThreads(
       BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
@@ -195,12 +193,10 @@ int32_t nvptx_parallel_reduce_nowait(int
     if (WarpId == 0)
       gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                 BlockThreadId);
-
-    return BlockThreadId == 0;
   }
   return BlockThreadId == 0;
 #else
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   if (Liveness == 0xffffffff) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
@@ -278,7 +274,7 @@ int32_t __kmpc_nvptx_parallel_reduce_now
 }
 
 INLINE
-int32_t nvptx_teams_reduce_nowait(
+static int32_t nvptx_teams_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
@@ -378,7 +374,7 @@ int32_t nvptx_teams_reduce_nowait(
     ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
 
   // Reduce across warps to the warp master.
-  uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+  uint32_t Liveness = __ACTIVEMASK();
   if (Liveness == 0xffffffff) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else // Partial warp but contiguous lanes

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queue.h Fri Jan  4 12:16:54 2019
@@ -35,14 +35,14 @@ private:
   static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
   INLINE uint32_t ENQUEUE_TICKET();
   INLINE uint32_t DEQUEUE_TICKET();
-  INLINE uint32_t ID(uint32_t ticket);
+  INLINE static uint32_t ID(uint32_t ticket);
   INLINE bool IsServing(uint32_t slot, uint32_t id);
   INLINE void PushElement(uint32_t slot, ElementType *element);
   INLINE ElementType *PopElement(uint32_t slot);
   INLINE void DoneServing(uint32_t slot, uint32_t id);
 
 public:
-  INLINE omptarget_nvptx_Queue(){};
+  INLINE omptarget_nvptx_Queue() {}
   INLINE void Enqueue(ElementType *element);
   INLINE ElementType *Dequeue();
 };

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h?rev=350431&r1=350430&r2=350431&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/state-queuei.h Fri Jan  4 12:16:54 2019
@@ -30,7 +30,8 @@ INLINE uint32_t omptarget_nvptx_Queue<El
 }
 
 template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+INLINE uint32_t
+omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
   return (ticket / SIZE) * 2;
 }