[Openmp-commits] [openmp] a7b7b5d - [OpenMP] Create and use `__kmpc_is_generic_main_thread`
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Sun Jul 11 17:18:37 PDT 2021
Author: Johannes Doerfert
Date: 2021-07-11T19:18:03-05:00
New Revision: a7b7b5dfe5a931a76cbe8410e5a9f55beea73c8e
URL: https://github.com/llvm/llvm-project/commit/a7b7b5dfe5a931a76cbe8410e5a9f55beea73c8e
DIFF: https://github.com/llvm/llvm-project/commit/a7b7b5dfe5a931a76cbe8410e5a9f55beea73c8e.diff
LOG: [OpenMP] Create and use `__kmpc_is_generic_main_thread`
In order to fold calls based on high-level knowledge and control flow
tracking, it helps to expose the information as a runtime call. The
logic `!SPMD && getTID() == getMasterTID()` was used in various places
and is now encapsulated in `__kmpc_is_generic_main_thread`. As part of
this rewrite we replaced eager computation of arguments with on-demand
computation, which is especially helpful when the calls can be folded
and the arguments consequently do not need to be computed at all.
Differential Revision: https://reviews.llvm.org/D105768
Added:
Modified:
openmp/libomptarget/deviceRTLs/common/omptargeti.h
openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
openmp/libomptarget/deviceRTLs/common/src/libcall.cu
openmp/libomptarget/deviceRTLs/common/src/loop.cu
openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
openmp/libomptarget/deviceRTLs/common/src/parallel.cu
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/common/src/support.cu
openmp/libomptarget/deviceRTLs/common/src/sync.cu
openmp/libomptarget/deviceRTLs/common/src/task.cu
openmp/libomptarget/deviceRTLs/common/support.h
openmp/libomptarget/deviceRTLs/interface.h
Removed:
################################################################################
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
index 888b66b541ee..02feaf5f30bd 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -192,7 +192,7 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
- return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
+ return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
index 65db27b63bdf..445e8c1faf0a 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -15,11 +15,6 @@
#include "target/shuffle.h"
#include "target_impl.h"
-// Return true if this is the master thread.
-INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
- return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
-}
-
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
@@ -66,7 +61,8 @@ static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
Bytes = Bytes + (Bytes % MinBytes);
- if (IsMasterThread(__kmpc_is_spmd_exec_mode())) {
+ int TID = GetThreadIdInBlock();
+ if (__kmpc_is_generic_main_thread(TID)) {
// Main thread alone, use shared memory if space is available.
if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
@@ -75,7 +71,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
return Ptr;
}
} else {
- int TID = GetThreadIdInBlock();
int WID = GetWarpId();
unsigned WarpBytes = Bytes * WARPSIZE;
auto AllocSharedStack = [&]() {
@@ -92,7 +87,6 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
}
// Fallback to malloc
- int TID = GetThreadIdInBlock();
unsigned WarpBytes = Bytes * WARPSIZE;
auto AllocGlobal = [&] {
return SafeMalloc(WarpBytes, "AllocGlobalFallback");
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
index ae0590284d06..65816aeb863d 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -68,9 +68,7 @@ EXTERN int omp_get_thread_limit(void) {
}
EXTERN int omp_get_thread_num() {
- bool isSPMDExecutionMode = __kmpc_is_spmd_exec_mode();
- int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
- int rc = GetOmpThreadId(tid, isSPMDExecutionMode);
+ int rc = GetOmpThreadId();
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
index 709905724192..e6d392c4c64d 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -210,7 +210,7 @@ public:
ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
return;
}
- int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
T tripCount = ub - lb + 1; // +1 because ub is inclusive
@@ -453,7 +453,7 @@ public:
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
- int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int tid = GetLogicalThreadIdInBlock();
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
"current thread is not needed here; error");
// retrieve schedule
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 2a5cc312376a..153754fc3fdd 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -160,6 +160,10 @@ EXTERN int8_t __kmpc_is_spmd_exec_mode() {
return (execution_param & ModeMask) == Spmd;
}
+EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
+ return !__kmpc_is_spmd_exec_mode() && GetMasterThreadID() == Tid;
+}
+
EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
static void __kmpc_target_region_state_machine(ident_t *Ident) {
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index d404015eb881..2fd7b0585ad6 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -188,7 +188,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
}
// assume this is only called for nested parallel
- int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int threadId = GetLogicalThreadIdInBlock();
// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
@@ -227,7 +227,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
}
// pop stack
- int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int threadId = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
@@ -249,8 +249,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
- int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
- return GetOmpThreadId(tid, __kmpc_is_spmd_exec_mode());
+ return GetOmpThreadId();
}
////////////////////////////////////////////////////////////////////////////////
@@ -262,7 +261,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
"Runtime must be initialized.");
- tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index da025f4acd11..6c02790ac0aa 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -69,7 +69,7 @@ static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
- uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+ uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
@@ -184,10 +184,11 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_ListGlobalFctPtr glredFct) {
// Terminate all threads in non-SPMD mode except for the master thread.
- if (!__kmpc_is_spmd_exec_mode() && GetThreadIdInBlock() != GetMasterThreadID())
+ if (!__kmpc_is_spmd_exec_mode() &&
+ !__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
return 0;
- uint32_t ThreadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ uint32_t ThreadId = GetLogicalThreadIdInBlock();
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
index d711c2a9d708..f9014dfc4284 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -67,11 +67,11 @@ int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
-int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
+int GetLogicalThreadIdInBlock() {
// Implemented using control flow (predication) instead of with a modulo
// operation.
int tid = GetThreadIdInBlock();
- if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
+ if (__kmpc_is_generic_main_thread(tid))
return 0;
else
return tid;
@@ -83,16 +83,19 @@ int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
//
////////////////////////////////////////////////////////////////////////////////
-int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
+int GetOmpThreadId() {
+ int tid = GetThreadIdInBlock();
+ if (__kmpc_is_generic_main_thread(tid))
+ return 0;
// omp_thread_num
int rc;
if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
rc = 0;
- } else if (isSPMDExecutionMode) {
- rc = GetThreadIdInBlock();
+ } else if (__kmpc_is_spmd_exec_mode()) {
+ rc = tid;
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadId();
}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
index 0dfbacf68051..1dcd9abfa9e6 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -47,7 +47,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
- tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ tid = GetLogicalThreadIdInBlock();
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
if (numberOfActiveOMPThreads > 1) {
diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu
index 23470e796d72..3c6020c5d6c2 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/task.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu
@@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");
// 2. push new context: update new task descriptor
- int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
@@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");
// 2. push new context: update new task descriptor
- int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
@@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3... noting to call... is inline
// 4. pop context
- int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode());
+ int tid = GetLogicalThreadIdInBlock();
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
index 92a3f82df374..7a4e46feead2 100644
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -41,13 +41,12 @@ bool isRuntimeInitialized();
////////////////////////////////////////////////////////////////////////////////
// get global ids to locate tread/team info (constant regardless of OMP)
-int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+int GetLogicalThreadIdInBlock();
int GetMasterThreadID();
int GetNumberOfWorkersInTeam();
// get OpenMP thread and team ids
-int GetOmpThreadId(int threadId,
- bool isSPMDExecutionMode); // omp_thread_num
+int GetOmpThreadId(); // omp_thread_num
int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index e0c433060c85..513075779972 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -449,6 +449,10 @@ EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
+/// Return true if the hardware thread id \p Tid represents the OpenMP main
+/// thread in generic mode outside of a parallel region.
+EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);
+
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared, const void **res);
More information about the Openmp-commits
mailing list