[Openmp-commits] [openmp] r350405 - [OPENMP][NVPTX]Improve performance + reduce number of used registers.

Alexey Bataev via Openmp-commits openmp-commits at lists.llvm.org
Fri Jan 4 09:09:13 PST 2019


Author: abataev
Date: Fri Jan  4 09:09:12 2019
New Revision: 350405

URL: http://llvm.org/viewvc/llvm-project?rev=350405&view=rev
Log:
[OPENMP][NVPTX]Improve performance + reduce number of used registers.

Summary:
Reduced number of the used register + improved performance propagating
the information about current execution/data sharing mode directly from
the compiler, where it is possible.
In some cases, it requires new/reworked interfaces of the runtime
external functions. Old functions are marked as deprecated.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, openmp-commits, caomhin

Differential Revision: https://reviews.llvm.org/D56278

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Fri Jan  4 09:09:12 2019
@@ -43,8 +43,8 @@ __device__ static bool IsWarpMasterActiv
   return (unsigned)Sh == 0;
 }
 // Return true if this is the master thread.
-__device__ static bool IsMasterThread() {
-  return !isSPMDMode() && getMasterThreadId() == getThreadId();
+__device__ static bool IsMasterThread(bool isSPMDExecutionMode) {
+  return !isSPMDExecutionMode && getMasterThreadId() == getThreadId();
 }
 
 /// Return the provided size aligned to the size of a pointer.
@@ -88,7 +88,8 @@ __kmpc_initialize_data_sharing_environme
 
   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();
-  __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread());
+  __kmpc_data_sharing_slot *RootS =
+      teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));
 
   DataSharingState.SlotPtr[WID] = RootS;
   DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
@@ -253,8 +254,9 @@ EXTERN void __kmpc_data_sharing_environm
 
       // The master thread cleans the saved slot, because this is an environment
       // only for the master.
-      __kmpc_data_sharing_slot *S =
-          IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID];
+      __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
+                                        ? *SavedSharedSlot
+                                        : DataSharingState.SlotPtr[WID];
 
       if (S->Next) {
         free(S->Next);
@@ -472,8 +474,9 @@ EXTERN void* __kmpc_data_sharing_push_st
   // space for the variables of each thread in the warp,
   // i.e. one DataSize chunk per warp lane.
   // TODO: change WARPSIZE to the number of active threads in the warp.
-  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
-      DataSize : WARPSIZE * DataSize;
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
+                        ? DataSize
+                        : WARPSIZE * DataSize;
 
   // Compute the start address of the frame of each thread in the warp.
   uintptr_t FrameStartAddress =
@@ -553,14 +556,15 @@ EXTERN void __kmpc_get_shared_variables(
 // manage statically allocated global memory. This memory is allocated by the
 // compiler and used to correctly implement globalization of the variables in
 // target, teams and distribute regions.
-EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+                                          const void *buf, size_t size,
                                           int16_t is_shared,
                                           const void **frame) {
   if (is_shared) {
     *frame = buf;
     return;
   }
-  if (isSPMDMode()) {
+  if (isSPMDExecutionMode) {
     if (GetThreadIdInBlock() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
@@ -574,10 +578,11 @@ EXTERN void __kmpc_get_team_static_memor
   __threadfence();
 }
 
-EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) {
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+                                              int16_t is_shared) {
   if (is_shared)
     return;
-  if (isSPMDMode()) {
+  if (isSPMDExecutionMode) {
     // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
     __SYNCTHREADS();
     if (GetThreadIdInBlock() == 0) {

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h Fri Jan  4 09:09:12 2019
@@ -395,9 +395,13 @@ EXTERN void __kmpc_reduce_conditional_la
 // reduction
 EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
 EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
-EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait(
+EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
+    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct);
 EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
@@ -550,9 +554,11 @@ __kmpc_get_data_sharing_environment_fram
 // SPMD execution mode interrogation function.
 EXTERN int8_t __kmpc_is_spmd_exec_mode();
 
-EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+                                          const void *buf, size_t size,
                                           int16_t is_shared, const void **res);
 
-EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared);
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+                                              int16_t is_shared);
 
 #endif

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu Fri Jan  4 09:09:12 2019
@@ -39,14 +39,17 @@ EXTERN void omp_set_num_threads(int num)
   if (num <= 0) {
     WARNING0(LW_INPUT, "expected positive num; ignore\n");
   } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
     currTaskDescr->NThreads() = num;
   }
 }
 
 EXTERN int omp_get_num_threads(void) {
-  int tid = GetLogicalThreadIdInBlock();
-  int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+  bool isSPMDExecutionMode = isSPMDMode();
+  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+  int rc =
+      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
   PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
   return rc;
 }
@@ -58,7 +61,8 @@ EXTERN int omp_get_max_threads(void) {
     // We're already in parallel region.
     return 1;  // default is 1 thread avail
   }
-  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      getMyTopTaskDescriptor(isSPMDMode());
   int rc = 1; // default is 1 thread avail
   if (!currTaskDescr->InParallelRegion()) {
     // Not currently in a parallel region, return what was set.
@@ -76,21 +80,23 @@ EXTERN int omp_get_thread_limit(void) {
     return 0;  // default is 0
   }
   // per contention group.. meaning threads in current team
-  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      getMyTopTaskDescriptor(isSPMDMode());
   int rc = currTaskDescr->ThreadLimit();
   PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
   return rc;
 }
 
 EXTERN int omp_get_thread_num() {
-  int tid = GetLogicalThreadIdInBlock();
-  int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
+  bool isSPMDExecutionMode = isSPMDMode();
+  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+  int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized());
   PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
   return rc;
 }
 
 EXTERN int omp_get_num_procs(void) {
-  int rc = GetNumberOfProcsInDevice();
+  int rc = GetNumberOfProcsInDevice(isSPMDMode());
   PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
   return rc;
 }
@@ -102,7 +108,8 @@ EXTERN int omp_in_parallel(void) {
             "Expected SPMD mode only with uninitialized runtime.");
     rc = 1;  // SPMD mode is always in parallel.
   } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        getMyTopTaskDescriptor(isSPMDMode());
     if (currTaskDescr->InParallelRegion()) {
       rc = 1;
     }
@@ -161,7 +168,8 @@ EXTERN int omp_get_level(void) {
     return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
   }
   int level = 0;
-  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      getMyTopTaskDescriptor(isSPMDMode());
   ASSERT0(LT_FUSSY, currTaskDescr,
           "do not expect fct to be called in a non-active thread");
   do {
@@ -181,7 +189,8 @@ EXTERN int omp_get_active_level(void) {
     return 1;
   }
   int level = 0; // no active level parallelism
-  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      getMyTopTaskDescriptor(isSPMDMode());
   ASSERT0(LT_FUSSY, currTaskDescr,
           "do not expect fct to be called in a non-active thread");
   do {
@@ -208,7 +217,8 @@ EXTERN int omp_get_ancestor_thread_num(i
   } else if (level > 0) {
     int totLevel = omp_get_level();
     if (level <= totLevel) {
-      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      omptarget_nvptx_TaskDescr *currTaskDescr =
+          getMyTopTaskDescriptor(isSPMDMode());
       int steps = totLevel - level;
       PRINT(LD_IO, "backtrack %d steps\n", steps);
       ASSERT0(LT_FUSSY, currTaskDescr,
@@ -259,7 +269,8 @@ EXTERN int omp_get_team_size(int level)
   } else if (level > 0) {
     int totLevel = omp_get_level();
     if (level <= totLevel) {
-      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      omptarget_nvptx_TaskDescr *currTaskDescr =
+          getMyTopTaskDescriptor(isSPMDMode());
       int steps = totLevel - level;
       ASSERT0(LT_FUSSY, currTaskDescr,
               "do not expect fct to be called in a non-active thread");
@@ -288,7 +299,8 @@ EXTERN void omp_get_schedule(omp_sched_t
     *kind = omp_sched_static;
     *modifier = 1;
   } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        getMyTopTaskDescriptor(isSPMDMode());
     *kind = currTaskDescr->GetRuntimeSched();
     *modifier = currTaskDescr->RuntimeChunkSize();
   }
@@ -305,7 +317,8 @@ EXTERN void omp_set_schedule(omp_sched_t
     return;
   }
   if (kind >= omp_sched_static && kind < omp_sched_auto) {
-    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        getMyTopTaskDescriptor(isSPMDMode());
     currTaskDescr->SetRuntimeSched(kind);
     currTaskDescr->RuntimeChunkSize() = modifier;
     PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu Fri Jan  4 09:09:12 2019
@@ -101,7 +101,7 @@ public:
     // When IsRuntimeUninitialized is true, we assume that the caller is
     // in an L0 parallel region and that all worker threads participate.
 
-    int tid = GetLogicalThreadIdInBlock();
+    int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
 
     // Assume we are in teams region or that we use a single block
     // per target region
@@ -208,7 +208,7 @@ public:
                                    ST chunk) {
     ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
             "Expected non-SPMD mode + initialized runtime.");
-    int tid = GetLogicalThreadIdInBlock();
+    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
     T tnum = currTaskDescr->ThreadsInTeam();
     T tripCount = ub - lb + 1; // +1 because ub is inclusive
@@ -417,17 +417,18 @@ public:
   // On Pascal, with inlining of the runtime into the user application,
   // this code deadlocks.  This is probably because different threads
   // in a warp cannot make independent progress.
-  NOINLINE static int dispatch_next(int32_t gtid, int32_t *plast, T *plower,
-                                    T *pupper, ST *pstride) {
-    ASSERT0(LT_FUSSY, isRuntimeInitialized(),
+  NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid,
+                                    int32_t *plast, T *plower, T *pupper,
+                                    ST *pstride) {
+    ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
             "Expected non-SPMD mode + initialized runtime.");
     // ID of a thread in its own warp
 
     // automatically selects thread or warp ID based on selected implementation
-    int tid = GetLogicalThreadIdInBlock();
+    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
     ASSERT0(LT_FUSSY,
-            gtid < GetNumberOfOmpThreads(tid, isSPMDMode(),
-                                         isRuntimeUninitialized()),
+            gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
+                                         checkRuntimeUninitialized(loc)),
             "current thread is not needed here; error");
     // retrieve schedule
     kmp_sched_t schedule =
@@ -540,7 +541,7 @@ EXTERN int __kmpc_dispatch_next_4(kmp_Id
                                   int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
   return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
-      tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st);
 }
 
 EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
@@ -548,14 +549,14 @@ EXTERN int __kmpc_dispatch_next_4u(kmp_I
                                    uint32_t *p_ub, int32_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
   return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
-      tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st);
 }
 
 EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
   return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
-      tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st);
 }
 
 EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
@@ -563,7 +564,7 @@ EXTERN int __kmpc_dispatch_next_8u(kmp_I
                                    uint64_t *p_ub, int64_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
   return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
-      tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st);
 }
 
 // fini
@@ -756,7 +757,7 @@ EXTERN void __kmpc_reduce_conditional_la
           "Expected non-SPMD mode + initialized runtime.");
 
   omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
-  int tid = GetLogicalThreadIdInBlock();
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
                                               checkRuntimeUninitialized(loc));
   uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Fri Jan  4 09:09:12 2019
@@ -61,12 +61,12 @@ EXTERN void __kmpc_kernel_init(int Threa
       omptarget_nvptx_device_State[slot].Dequeue();
 
   // init thread private
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false);
   omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
 
   // init team context
   omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
-  currTeamDescr.InitTeamDescr();
+  currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false);
   // this thread will start execution... has to update its task ICV
   // to point to the level zero task ICV. That ICV was init in
   // InitTeamDescr()
@@ -128,7 +128,7 @@ EXTERN void __kmpc_spmd_kernel_init(int
     omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
     omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
     // init team context
-    currTeamDescr.InitTeamDescr();
+    currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true);
   }
   // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
   __SYNCTHREADS();

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Fri Jan  4 09:09:12 2019
@@ -176,7 +176,7 @@ public:
     prev = taskDescr;
   }
   // init & copy
-  INLINE void InitLevelZeroTaskDescr();
+  INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
   INLINE void InitLevelOneTaskDescr(uint16_t tnum,
                                     omptarget_nvptx_TaskDescr *parentTaskDescr);
   INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
@@ -257,7 +257,7 @@ public:
   INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
 
   // init
-  INLINE void InitTeamDescr();
+  INLINE void InitTeamDescr(bool isSPMDExecutionMode);
 
   INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
     // If this is invoked by the master thread of the master warp then intialize
@@ -462,7 +462,8 @@ extern volatile __device__ __shared__ om
 
 INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
 INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
-INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode);
 INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
 
 ////////////////////////////////////////////////////////////////////////////////

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h Fri Jan  4 09:09:12 2019
@@ -31,7 +31,8 @@ INLINE void omptarget_nvptx_TaskDescr::S
   items.flags |= val;
 }
 
-INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
+INLINE void
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
   // slow method
   // flag:
   //   default sched is static,
@@ -39,7 +40,7 @@ INLINE void omptarget_nvptx_TaskDescr::I
   //   not in parallel
 
   items.flags = 0;
-  items.nthreads = GetNumberOfProcsInTeam();
+  items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
   ;                                // threads: whatever was alloc by kernel
   items.threadId = 0;         // is master
   items.threadsInTeam = 1;    // sequential
@@ -177,8 +178,8 @@ omptarget_nvptx_ThreadPrivateContext::In
 // Team Descriptor
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
-  levelZeroTaskDescr.InitLevelZeroTaskDescr();
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
+  levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -199,8 +200,9 @@ INLINE omptarget_nvptx_TaskDescr *getMyT
   return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
 }
 
-INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
-  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
+  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
 }
 
 ////////////////////////////////////////////////////////////////////////////////

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu Fri Jan  4 09:09:12 2019
@@ -57,7 +57,7 @@ EXTERN bool __kmpc_kernel_convergent_sim
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
   *LaneId = __popc(ConvergentMask & lanemask_lt);
 
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
 
   ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
@@ -101,7 +101,7 @@ EXTERN bool __kmpc_kernel_convergent_sim
 EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
   // pop stack
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
   omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
       job->slimForNextSimd;
@@ -131,7 +131,7 @@ EXTERN bool __kmpc_kernel_convergent_par
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
   uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
 
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
 
   ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
@@ -181,7 +181,7 @@ EXTERN bool __kmpc_kernel_convergent_par
 EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
   // pop stack
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
   ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, job->convHeadTaskDescr);
@@ -345,7 +345,7 @@ EXTERN void __kmpc_serialized_parallel(k
   }
 
   // assume this is only called for nested parallel
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
 
   // unlike actual parallel, threads in the same team do not share
   // the workTaskDescr in this case and num threads is fixed to 1
@@ -384,7 +384,7 @@ EXTERN void __kmpc_end_serialized_parall
   }
 
   // pop stack
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   // set new top
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
@@ -404,7 +404,7 @@ EXTERN uint16_t __kmpc_parallel_level(km
     return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
   }
 
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_TaskDescr *currTaskDescr =
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
   if (currTaskDescr->InL2OrHigherParallelRegion())
@@ -420,7 +420,7 @@ EXTERN uint16_t __kmpc_parallel_level(km
 // it's cheap to recalculate this value so we never use the result
 // of this call.
 EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
-  int tid = GetLogicalThreadIdInBlock();
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   return GetOmpThreadId(tid, checkSPMDMode(loc),
                         checkRuntimeUninitialized(loc));
 }
@@ -433,7 +433,7 @@ EXTERN void __kmpc_push_num_threads(kmp_
                                     int32_t num_threads) {
   PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
   ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
-  tid = GetLogicalThreadIdInBlock();
+  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
       num_threads;
 }
@@ -442,7 +442,7 @@ EXTERN void __kmpc_push_simd_limit(kmp_I
                                    int32_t simd_limit) {
   PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
   ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
-  tid = GetLogicalThreadIdInBlock();
+  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
 }
 

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/reduction.cu Fri Jan  4 09:09:12 2019
@@ -20,8 +20,10 @@
 // may eventually remove this
 EXTERN
 int32_t __gpu_block_reduce() {
-  int tid = GetLogicalThreadIdInBlock();
-  int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+  bool isSPMDExecutionMode = isSPMDMode();
+  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+  int nt =
+      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();
@@ -35,7 +37,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc
                           size_t reduce_size, void *reduce_data,
                           void *reduce_array_size, kmp_ReductFctPtr *reductFct,
                           kmp_CriticalName *lck) {
-  int threadId = GetLogicalThreadIdInBlock();
+  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   int numthread;
   if (currTaskDescr->IsParallelConstruct()) {
@@ -150,7 +152,7 @@ int32_t nvptx_parallel_reduce_nowait(int
                                      kmp_InterWarpCopyFctPtr cpyFct,
                                      bool isSPMDExecutionMode,
                                      bool isRuntimeUninitialized) {
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
   uint32_t NumThreads = GetNumberOfOmpThreads(
       BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
   if (NumThreads == 1)
@@ -236,8 +238,7 @@ int32_t nvptx_parallel_reduce_nowait(int
 #endif // __CUDA_ARCH__ >= 700
 }
 
-EXTERN
-int32_t __kmpc_nvptx_parallel_reduce_nowait(
+EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
   return nvptx_parallel_reduce_nowait(
@@ -247,6 +248,16 @@ int32_t __kmpc_nvptx_parallel_reduce_now
 }
 
 EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
+    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct) {
+  return nvptx_parallel_reduce_nowait(
+      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
+      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+}
+
+EXTERN
 int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
@@ -272,7 +283,7 @@ int32_t nvptx_teams_reduce_nowait(
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
     bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
-  uint32_t ThreadId = GetLogicalThreadIdInBlock();
+  uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
   // In non-generic mode all workers participate in the teams reduction.
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/support.h Fri Jan  4 09:09:12 2019
@@ -43,7 +43,7 @@ INLINE int GetNumberOfBlocksInKernel();
 INLINE int GetNumberOfThreadsInBlock();
 
 // get global ids to locate tread/team info (constant regardless of OMP)
-INLINE int GetLogicalThreadIdInBlock();
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
 INLINE int GetMasterThreadID();
 INLINE int GetNumberOfWorkersInTeam();
 
@@ -59,8 +59,8 @@ GetNumberOfOmpThreads(int threadId, bool
 INLINE int GetNumberOfOmpTeams();                   // omp_num_teams
 
 // get OpenMP number of procs
-INLINE int GetNumberOfProcsInTeam();
-INLINE int GetNumberOfProcsInDevice();
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
 
 // masters
 INLINE int IsTeamMaster(int ompThreadId);

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h Fri Jan  4 09:09:12 2019
@@ -130,11 +130,11 @@ INLINE int GetNumberOfWorkersInTeam() {
 // or a serial region by the master.  If the master (whose CUDA thread
 // id is GetMasterThreadID()) calls this routine, we return 0 because
 // it is a shadow for the first worker.
-INLINE int GetLogicalThreadIdInBlock() {
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
   int tid = GetThreadIdInBlock();
-  if (isGenericMode() && tid >= GetMasterThreadID())
+  if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
     return 0;
   else
     return tid;
@@ -214,13 +214,15 @@ INLINE int IsTeamMaster(int ompThreadId)
 // get OpenMP number of procs
 
 // Get the number of processors in the device.
-INLINE int GetNumberOfProcsInDevice() {
-  if (isGenericMode())
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
+  if (!isSPMDExecutionMode)
     return GetNumberOfWorkersInTeam();
   return GetNumberOfThreadsInBlock();
 }
 
-INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
+  return GetNumberOfProcsInDevice(isSPMDExecutionMode);
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Memory

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu Fri Jan  4 09:09:12 2019
@@ -46,7 +46,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *lo
             "Expected SPMD mode with uninitialized runtime.");
     __kmpc_barrier_simple_spmd(loc_ref, tid);
   } else {
-    tid = GetLogicalThreadIdInBlock();
+    tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
     omptarget_nvptx_TaskDescr *currTaskDescr =
         omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
     int numberOfActiveOMPThreads = GetNumberOfOmpThreads(

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu?rev=350405&r1=350404&r2=350405&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu Fri Jan  4 09:09:12 2019
@@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps
           "bad assumptions");
 
   // 2. push new context: update new task descriptor
-  int tid = GetLogicalThreadIdInBlock();
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
   newTaskDescr->CopyForExplicitTask(parentTaskDescr);
   // set new task descriptor as top
@@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(km
           "bad assumptions");
 
   // 2. push new context: update new task descriptor
-  int tid = GetLogicalThreadIdInBlock();
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
   newTaskDescr->CopyForExplicitTask(parentTaskDescr);
   // set new task descriptor as top
@@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0
   omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
   // 3... noting to call... is inline
   // 4. pop context
-  int tid = GetLogicalThreadIdInBlock();
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
                                                              parentTaskDescr);
   // 5. free




More information about the Openmp-commits mailing list