[Openmp-commits] [openmp] 6e574f1 - Revert "[OpenMP] Provide a specialized team reduction for the common case (#70766)"
Shilei Tian via Openmp-commits
openmp-commits at lists.llvm.org
Tue Nov 7 16:17:01 PST 2023
Author: Shilei Tian
Date: 2023-11-07T19:16:44-05:00
New Revision: 6e574f125d47fc148c6312e46158e202ab3010eb
URL: https://github.com/llvm/llvm-project/commit/6e574f125d47fc148c6312e46158e202ab3010eb
DIFF: https://github.com/llvm/llvm-project/commit/6e574f125d47fc148c6312e46158e202ab3010eb.diff
LOG: Revert "[OpenMP] Provide a specialized team reduction for the common case (#70766)"
This reverts commit eab828d46c2fb7613df0bc44d34ff89702ffcc80.
Added:
Modified:
openmp/libomptarget/DeviceRTL/src/Reduction.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index 41e1dd5180613f5..cc9a01ab7589026 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -176,109 +176,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}
-/// Mostly like _v2 but with the builtin assumption that we have less than
-/// num_of_records (by default 1024) teams.
-int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
- IdentTy *Loc, void *__restrict__ GlobalBuffer, uint32_t num_of_records,
- uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
- ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
- // Terminate all threads in non-SPMD mode except for the main thread.
- uint32_t ThreadId = mapping::getThreadIdInBlock();
- if (mapping::isGenericMode()) {
- if (!mapping::isMainThreadInGenericMode())
- return 0;
- ThreadId = 0;
- }
-
- uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
- // In non-generic mode all workers participate in the teams reduction.
- // In generic mode only the team main participates in the teams
- // reduction because the workers are waiting for parallel work.
- uint32_t NumThreads = omp_get_num_threads();
- uint32_t TeamId = omp_get_team_num();
- uint32_t NumTeams = omp_get_num_teams();
- static unsigned SHARED(ChunkTeamCount);
-
- // Block progress for teams greater than the current upper
- // limit. We always only allow a number of teams less or equal
- // to the number of slots in the buffer.
- bool IsMain = (ThreadId == 0);
-
- if (IsMain) {
- lgcpyFct(GlobalBuffer, TeamId, reduce_data);
-
- // Propagate the memory writes above to the world.
- fence::kernel(atomic::release);
-
- // Increment team counter.
- // This counter is incremented by all teams in the current
- // BUFFER_SIZE chunk.
- ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
- atomic::MemScopeTy::device);
- }
-
- // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
- // state machine.
- if (mapping::isSPMDMode())
- synchronize::threadsAligned(atomic::acq_rel);
-
- // Each thread will have a local struct containing the values to be
- // reduced:
- // 1. do reduction within each warp.
- // 2. do reduction across warps.
- // 3. write the final result to the main reduction variable
- // by returning 1 in the thread holding the reduction result.
-
- // Check if this is the very last team.
- if (ChunkTeamCount != NumTeams - 1)
- return 0;
-
- // Last team processing.
- NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
- if (ThreadId >= NumThreads)
- return 0;
-
- // Ensure we see the global memory writes by other teams
- fence::kernel(atomic::aquire);
-
- // Load from buffer and reduce.
- glcpyFct(GlobalBuffer, ThreadId, reduce_data);
- for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
- glredFct(GlobalBuffer, i, reduce_data);
-
- // Reduce across warps to the warp main.
- gpu_regular_warp_reduce(reduce_data, shflFct);
-
- uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
- uint32_t WarpsNeeded =
- (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- if (mapping::getWarpIdInBlock() == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
-
- return IsMain;
-}
-
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
- IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
- uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
- ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
- // The first check is a compile time constant, the second one a runtime check.
- // If the first one succeeds we will use the specialized version.
- if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
- state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
- num_of_records == 1024) ||
- (omp_get_num_teams() <= num_of_records))
- return __kmpc_nvptx_teams_reduce_nowait_v3(
- Loc, GlobalBuffer, num_of_records, reduce_data_size, reduce_data,
- shflFct, cpyFct, lgcpyFct, lgredFct, glcpyFct, glredFct);
-
+ IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
+ void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
+ ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
+ ListGlobalFnTy glredFct) {
// Terminate all threads in non-SPMD mode except for the master thread.
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (mapping::isGenericMode()) {
More information about the Openmp-commits mailing list