[Openmp-commits] [openmp] [OpenMP] Team reduction work specialization (PR #70766)
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 2 12:53:02 PDT 2023
https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/70766
>From e28dfcee74e9b9d6aa67120fb81435b616166d66 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 30 Oct 2023 22:31:45 -0700
Subject: [PATCH] [OpenMP] Provide a specialized team reduction for the common
case
We default to < 1024 teams if the user did not specify otherwise. As
such we can avoid the extra logic in the teams reduction that handles
more than num_of_records (default 1024) teams. This is a stopgap but
still shaves off 33% of the runtime in some simple reduction examples.
---
.../libomptarget/DeviceRTL/src/Reduction.cpp | 107 ++++++++++++++++++
1 file changed, 107 insertions(+)
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index efa09cafa879ec1..51bc16bdfd18e1f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -178,11 +178,118 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
false);
}
+/// Mostly like _v2 but with the builtin assumption that we have less than
+/// num_of_records (by default 1024) teams.
+///
+/// Each team copies its partial reduction value into its own slot of
+/// \p GlobalBuffer (slot == team id); the last team to arrive then reduces
+/// all slots and signals the final result by returning 1 from its master
+/// thread. NOTE(review): this assumes NumTeams <= num_of_records so every
+/// team owns a unique buffer slot -- the caller (_v2) must guarantee this.
+int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
+ IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
+ uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
+ ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
+ // Terminate all threads in non-SPMD mode except for the master thread.
+ uint32_t ThreadId = mapping::getThreadIdInBlock();
+ if (mapping::isGenericMode()) {
+ if (!mapping::isMainThreadInGenericMode())
+ return 0;
+ ThreadId = 0;
+ }
+
+ // Per-launch arrival counter shared by all teams of this kernel; used
+ // below to detect the last team to finish writing its partial result.
+ uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
+ // In non-generic mode all workers participate in the teams reduction.
+ // In generic mode only the team master participates in the teams
+ // reduction because the workers are waiting for parallel work.
+ uint32_t NumThreads = omp_get_num_threads();
+ uint32_t TeamId = omp_get_team_num();
+ uint32_t NumTeams = omp_get_num_teams();
+ // Team-shared (SHARED = block-local) copy of this team's arrival ticket,
+ // written by the master and read by all threads after the sync below.
+ static unsigned SHARED(ChunkTeamCount);
+
+ // Exactly one thread per team (the master) publishes the team's partial
+ // result into its buffer slot and takes an arrival ticket.
+ bool IsMaster = (ThreadId == 0);
+
+ if (IsMaster) {
+ lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+
+ // Increment team counter.
+ // This counter is incremented by all teams in the current
+ // BUFFER_SIZE chunk. NOTE(review): atomic::inc presumably wraps the
+ // counter back to 0 once it reaches NumTeams (atomicInc-style), leaving
+ // it ready for the next reduction in this launch -- confirm against the
+ // atomic wrapper; the last team to arrive observes NumTeams - 1.
+ ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
+ atomic::MemScopeTy::device);
+ }
+ // Synchronize: in SPMD mode all threads of the team reach this point, so
+ // an aligned barrier publishes ChunkTeamCount (and the buffer write) to
+ // them; in generic mode only the master runs here, so a device-scope
+ // fence ordering the buffer write against later reads is sufficient.
+ if (mapping::isSPMDMode())
+ synchronize::threadsAligned(atomic::acq_rel);
+ else
+ fence::kernel(atomic::acq_rel);
+
+ // reduce_data is global or shared so before being reduced within the
+ // warp we need to bring it in local memory:
+ // local_reduce_data = reduce_data[i]
+ //
+ // Example for 3 reduction variables a, b, c (of potentially different
+ // types):
+ //
+ // buffer layout (struct of arrays):
+ // a, a, ..., a, b, b, ... b, c, c, ... c
+ // |__________|
+ // number of teams
+ //
+ // local_data_reduce layout (struct):
+ // a, b, c
+ //
+ // Each thread will have a local struct containing the values to be
+ // reduced:
+ // 1. do reduction within each warp.
+ // 2. do reduction across warps.
+ // 3. write the final result to the main reduction variable
+ // by returning 1 in the thread holding the reduction result.
+
+ // Check if this is the very last team. All other teams are done: their
+ // partial results are in the buffer and they return 0 (no final result).
+ if (ChunkTeamCount != NumTeams - 1)
+ return 0;
+
+ // Last team processing.
+ // Use at most one thread per buffered team value, rounded up to whole
+ // warps so the warp-shuffle reductions below operate on full warps.
+ NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
+ if (ThreadId >= NumThreads)
+ return 0;
+
+ // Load from buffer and reduce: thread i starts from slot i and folds in
+ // slots i + NumThreads, i + 2*NumThreads, ... (strided over all teams).
+ glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+ for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+ glredFct(GlobalBuffer, i, reduce_data);
+
+ // Reduce across warps to the warp master.
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+
+ uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
+ uint32_t WarpsNeeded =
+ (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ // First warp folds the per-warp partials into a single value.
+ if (mapping::getWarpIdInBlock() == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+
+ // Only the master thread of the last team reports the final result (1).
+ return IsMaster;
+}
+
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
ListGlobalFnTy glredFct) {
+ // The first check is a compile time constant, the second one a runtime check.
+ // If the first one succeeds we will use the specialized version.
+ if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
+ state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
+ num_of_records == 1024) ||
+ (omp_get_num_teams() <= num_of_records))
+ return __kmpc_nvptx_teams_reduce_nowait_v3(
+ Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
+ lgcpyFct, lgredFct, glcpyFct, glredFct);
+
// Terminate all threads in non-SPMD mode except for the master thread.
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (mapping::isGenericMode()) {
More information about the Openmp-commits
mailing list