[Openmp-commits] [openmp] a7b7b5d - [OpenMP] Create and use `__kmpc_is_generic_main_thread`

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Sun Jul 11 17:18:37 PDT 2021


Author: Johannes Doerfert
Date: 2021-07-11T19:18:03-05:00
New Revision: a7b7b5dfe5a931a76cbe8410e5a9f55beea73c8e

URL: https://github.com/llvm/llvm-project/commit/a7b7b5dfe5a931a76cbe8410e5a9f55beea73c8e
DIFF: https://github.com/llvm/llvm-project/commit/a7b7b5dfe5a931a76cbe8410e5a9f55beea73c8e.diff

LOG: [OpenMP] Create and use `__kmpc_is_generic_main_thread`

To fold calls based on high-level knowledge and control-flow tracking,
it helps to expose the relevant information as a runtime call. The
logic `!SPMD && getTID() == getMasterTID()` was used in various places
and is now encapsulated in `__kmpc_is_generic_main_thread`. As part of
this rewrite, eager computation of arguments was replaced with on-demand
computation, which is especially helpful when the calls can be folded
and the arguments consequently do not need to be computed.

Differential Revision: https://reviews.llvm.org/D105768

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/common/omptargeti.h
    openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
    openmp/libomptarget/deviceRTLs/common/src/libcall.cu
    openmp/libomptarget/deviceRTLs/common/src/loop.cu
    openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
    openmp/libomptarget/deviceRTLs/common/src/parallel.cu
    openmp/libomptarget/deviceRTLs/common/src/reduction.cu
    openmp/libomptarget/deviceRTLs/common/src/support.cu
    openmp/libomptarget/deviceRTLs/common/src/sync.cu
    openmp/libomptarget/deviceRTLs/common/src/task.cu
    openmp/libomptarget/deviceRTLs/common/support.h
    openmp/libomptarget/deviceRTLs/interface.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
index 888b66b541ee..02feaf5f30bd 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -192,7 +192,7 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
 
 INLINE omptarget_nvptx_TaskDescr *
 getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
-  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
+  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
 }
 
 ////////////////////////////////////////////////////////////////////////////////

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
index 65db27b63bdf..445e8c1faf0a 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -15,11 +15,6 @@
 #include "target/shuffle.h"
 #include "target_impl.h"
 
-// Return true if this is the master thread.
-INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
-  return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////
@@ -66,7 +61,8 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
 
 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
   Bytes = Bytes + (Bytes % MinBytes);
-  if (IsMasterThread(__kmpc_is_spmd_exec_mode())) {
+  int TID = GetThreadIdInBlock();
+  if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
     if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
       void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
@@ -75,7 +71,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
       return Ptr;
     }
   } else {
-    int TID = GetThreadIdInBlock();
     int WID = GetWarpId();
     unsigned WarpBytes = Bytes * WARPSIZE;
     auto AllocSharedStack = [&]() {
@@ -92,7 +87,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
       return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
   }
   // Fallback to malloc
-  int TID = GetThreadIdInBlock();
   unsigned WarpBytes = Bytes * WARPSIZE;
   auto AllocGlobal = [&] {
     return SafeMalloc(WarpBytes, "AllocGlobalFallback");

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
index ae0590284d06..65816aeb863d 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -68,9 +68,7 @@ EXTERN int omp_get_thread_limit(void) {
 }
 
 EXTERN int omp_get_thread_num() {
-  bool isSPMDExecutionMode = __kmpc_is_spmd_exec_mode();
-  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
+  int rc = GetOmpThreadId();
   PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
   return rc;
 }

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
index 709905724192..e6d392c4c64d 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -210,7 +210,7 @@ public:
       ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
       return;
     }
-    int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+    int tid = GetLogicalThreadIdInBlock();
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
     T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
     T tripCount = ub - lb + 1; // +1 because ub is inclusive
@@ -453,7 +453,7 @@ public:
     // ID of a thread in its own warp
 
     // automatically selects thread or warp ID based on selected implementation
-    int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+    int tid = GetLogicalThreadIdInBlock();
     ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
             "current thread is not needed here; error");
     // retrieve schedule

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 2a5cc312376a..153754fc3fdd 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -160,6 +160,10 @@ EXTERN int8_t __kmpc_is_spmd_exec_mode() {
   return (execution_param & ModeMask) == Spmd;
 }
 
+EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
+  return !__kmpc_is_spmd_exec_mode() && GetMasterThreadID() == Tid;
+}
+
 EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
 
 static void __kmpc_target_region_state_machine(ident_t *Ident) {

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index d404015eb881..2fd7b0585ad6 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -188,7 +188,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
   }
 
   // assume this is only called for nested parallel
-  int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  int threadId = GetLogicalThreadIdInBlock();
 
   // unlike actual parallel, threads in the same team do not share
   // the workTaskDescr in this case and num threads is fixed to 1
@@ -227,7 +227,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
   }
 
   // pop stack
-  int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  int threadId = GetLogicalThreadIdInBlock();
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   // set new top
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
@@ -249,8 +249,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
 // it's cheap to recalculate this value so we never use the result
 // of this call.
 EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
-  int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
-  return GetOmpThreadId(tid, __kmpc_is_spmd_exec_mode());
+  return GetOmpThreadId();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -262,7 +261,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
   PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
   ASSERT0(LT_FUSSY, isRuntimeInitialized(),
           "Runtime must be initialized.");
-  tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  tid = GetLogicalThreadIdInBlock();
   omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
       num_threads;
 }

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index da025f4acd11..6c02790ac0aa 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -69,7 +69,7 @@ static int32_t nvptx_parallel_reduce_nowait(
     int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
   uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
   if (NumThreads == 1)
     return 1;
@@ -184,10 +184,11 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     kmp_ListGlobalFctPtr glredFct) {
 
   // Terminate all threads in non-SPMD mode except for the master thread.
-  if (!__kmpc_is_spmd_exec_mode() && GetThreadIdInBlock() != GetMasterThreadID())
+  if (!__kmpc_is_spmd_exec_mode() &&
+      !__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
     return 0;
 
-  uint32_t ThreadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  uint32_t ThreadId = GetLogicalThreadIdInBlock();
 
   // In non-generic mode all workers participate in the teams reduction.
   // In generic mode only the team master participates in the teams

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
index d711c2a9d708..f9014dfc4284 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -67,11 +67,11 @@ int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
 // or a serial region by the master.  If the master (whose CUDA thread
 // id is GetMasterThreadID()) calls this routine, we return 0 because
 // it is a shadow for the first worker.
-int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
+int GetLogicalThreadIdInBlock() {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
   int tid = GetThreadIdInBlock();
-  if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
+  if (__kmpc_is_generic_main_thread(tid))
     return 0;
   else
     return tid;
@@ -83,16 +83,19 @@ int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
+int GetOmpThreadId() {
+  int tid = GetThreadIdInBlock();
+  if (__kmpc_is_generic_main_thread(tid))
+    return 0;
   // omp_thread_num
   int rc;
   if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
     rc = 0;
-  } else if (isSPMDExecutionMode) {
-    rc = GetThreadIdInBlock();
+  } else if (__kmpc_is_spmd_exec_mode()) {
+    rc = tid;
   } else {
     omptarget_nvptx_TaskDescr *currTaskDescr =
-        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
     ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
     rc = currTaskDescr->ThreadId();
   }

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
index 0dfbacf68051..1dcd9abfa9e6 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -47,7 +47,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
             "Expected SPMD mode with uninitialized runtime.");
     __kmpc_barrier_simple_spmd(loc_ref, tid);
   } else {
-    tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+    tid = GetLogicalThreadIdInBlock();
     int numberOfActiveOMPThreads =
         GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
     if (numberOfActiveOMPThreads > 1) {

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu
index 23470e796d72..3c6020c5d6c2 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/task.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu
@@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
           "bad assumptions");
 
   // 2. push new context: update new task descriptor
-  int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  int tid = GetLogicalThreadIdInBlock();
   omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
   newTaskDescr->CopyForExplicitTask(parentTaskDescr);
   // set new task descriptor as top
@@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
           "bad assumptions");
 
   // 2. push new context: update new task descriptor
-  int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  int tid = GetLogicalThreadIdInBlock();
   omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
   newTaskDescr->CopyForExplicitTask(parentTaskDescr);
   // set new task descriptor as top
@@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
   omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
   // 3... noting to call... is inline
   // 4. pop context
-  int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+  int tid = GetLogicalThreadIdInBlock();
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
                                                              parentTaskDescr);
   // 5. free

diff  --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
index 92a3f82df374..7a4e46feead2 100644
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -41,13 +41,12 @@ bool isRuntimeInitialized();
 ////////////////////////////////////////////////////////////////////////////////
 
 // get global ids to locate tread/team info (constant regardless of OMP)
-int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+int GetLogicalThreadIdInBlock();
 int GetMasterThreadID();
 int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-int GetOmpThreadId(int threadId,
-                   bool isSPMDExecutionMode); // omp_thread_num
+int GetOmpThreadId();                         // omp_thread_num
 int GetOmpTeamId();                           // omp_team_num
 
 // get OpenMP number of threads and team

diff  --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index e0c433060c85..513075779972 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -449,6 +449,10 @@ EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
 // SPMD execution mode interrogation function.
 EXTERN int8_t __kmpc_is_spmd_exec_mode();
 
+/// Return true if the hardware thread id \p Tid represents the OpenMP main
+/// thread in generic mode outside of a parallel region.
+EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);
+
 EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                           const void *buf, size_t size,
                                           int16_t is_shared, const void **res);


        


More information about the Openmp-commits mailing list