[Openmp-commits] [openmp] e9a48f9 - [OpenMP] Simplify parallel reductions (#70983)
via Openmp-commits
openmp-commits at lists.llvm.org
Thu Nov 2 15:50:09 PDT 2023
Author: Johannes Doerfert
Date: 2023-11-02T15:50:05-07:00
New Revision: e9a48f9e05c103a235993c6b15a2c36442a2ddc1
URL: https://github.com/llvm/llvm-project/commit/e9a48f9e05c103a235993c6b15a2c36442a2ddc1
DIFF: https://github.com/llvm/llvm-project/commit/e9a48f9e05c103a235993c6b15a2c36442a2ddc1.diff
LOG: [OpenMP] Simplify parallel reductions (#70983)
A lot of the code was from a time when we had multiple parallel levels.
The new runtime is much simpler, the code can be simplified a lot which
should speed up reductions too.
Added:
openmp/libomptarget/test/offloading/generic_reduction.c
Modified:
openmp/libomptarget/DeviceRTL/src/Reduction.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index 9a30ded95b0cd77..49687f365b9228f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
- ShuffleReductFnTy shflFct) {
- uint32_t size, remote_id, physical_lane_id;
- physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
- __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
- __kmpc_impl_lanemask_t Liveness = mapping::activemask();
- uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
- __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
- do {
- Liveness = mapping::activemask();
- remote_id = utils::ffs(Liveness & lanemask_gt);
- size = utils::popc(Liveness);
- logical_lane_id /= 2;
- shflFct(reduce_data, /*LaneId =*/logical_lane_id,
- /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
- } while (logical_lane_id % 2 == 0 && size > 1);
- return (logical_lane_id == 0);
-}
-#endif
-
-static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
- uint64_t reduce_size,
- void *reduce_data,
+static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct,
- bool isSPMDExecutionMode, bool) {
- uint32_t BlockThreadId = mapping::getThreadIdInBlock();
- if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
- BlockThreadId = 0;
+ InterWarpCopyFnTy cpyFct) {
uint32_t NumThreads = omp_get_num_threads();
+ // Handle degenerated parallel regions, including all nested ones, first.
if (NumThreads == 1)
return 1;
- /*
- * This reduce function handles reduction within a team. It handles
- * parallel regions in both L1 and L2 parallelism levels. It also
- * supports Generic, SPMD, and NoOMP modes.
- *
- * 1. Reduce within a warp.
- * 2. Warp master copies value to warp 0 via shared memory.
- * 3. Warp 0 reduces to a single value.
- * 4. The reduced value is available in the thread that returns 1.
- */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- uint32_t WarpsNeeded =
+
+ /*
+ * 1. Reduce within a warp.
+ * 2. Warp master copies value to warp 0 via shared memory.
+ * 3. Warp 0 reduces to a single value.
+ * 4. The reduced value is available in the thread that returns 1.
+ */
+
+ uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+ uint32_t NumWarps =
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- uint32_t WarpId = mapping::getWarpIdInBlock();
- // Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
// beyond that, always a multiple of 32. For the SPMD execution mode we may
// have any number of threads.
- if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/NumThreads % mapping::getWarpSize(),
- /*LaneId=*/mapping::getThreadIdInBlock() %
- mapping::getWarpSize());
-
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > mapping::getWarpSize()) {
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
- }
- return BlockThreadId == 0;
-#else
- __kmpc_impl_lanemask_t Liveness = mapping::activemask();
- if (Liveness == lanes::All) // Full warp
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/utils::popc(Liveness),
- /*LaneId=*/mapping::getThreadIdInBlock() %
- mapping::getWarpSize());
- else { // Dispersed lanes. Only threads in L2
- // parallel region may enter here; return
- // early.
- return gpu_irregular_simd_reduce(reduce_data, shflFct);
- }
+ gpu_regular_warp_reduce(reduce_data, shflFct);
// When we have more than [mapping::getWarpSize()] number of threads
// a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
if (NumThreads > mapping::getWarpSize()) {
- uint32_t WarpsNeeded =
- (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
// Gather all the reduced values from each warp
// to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
+ cpyFct(reduce_data, NumWarps);
- uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
-
- return BlockThreadId == 0;
+ if (BlockThreadId < mapping::getWarpSize())
+ gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
}
- // Get the OMP thread Id. This is
diff erent from BlockThreadId in the case of
- // an L2 parallel region.
- return TId == 0;
-#endif // __CUDA_ARCH__ >= 700
+ // In Generic and in SPMD mode block thread Id 0 is what we want.
+ // It's either the main thread in SPMD mode or the "acting" main thread in the
+ // parallel region.
+ return BlockThreadId == 0;
}
uint32_t roundToWarpsize(uint32_t s) {
@@ -173,9 +99,7 @@ extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
- return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
- shflFct, cpyFct, mapping::isSPMDMode(),
- false);
+ return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}
/// Mostly like _v2 but with the builtin assumption that we have less than
diff --git a/openmp/libomptarget/test/offloading/generic_reduction.c b/openmp/libomptarget/test/offloading/generic_reduction.c
new file mode 100644
index 000000000000000..8b5ff0f067f9725
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/generic_reduction.c
@@ -0,0 +1,25 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+__attribute__((optnone)) void optnone(void) {}
+
+int main() {
+ int sum = 0, nt;
+#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
+ {
+ nt = 3 * omp_get_max_threads();
+ optnone();
+#pragma omp parallel reduction(+ : sum)
+ sum += 1;
+#pragma omp parallel reduction(+ : sum)
+ sum += 1;
+#pragma omp parallel reduction(+ : sum)
+ sum += 1;
+ }
+ // CHECK: nt: [[NT:.*]]
+ // CHECK: sum: [[NT]]
+ printf("nt: %i\n", nt);
+ printf("sum: %i\n", sum);
+}
More information about the Openmp-commits
mailing list