[llvm-branch-commits] [openmp] [OpenMP][offload] Inline target reductions (PR #196061)
Robert Imschweiler via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed May 6 05:52:16 PDT 2026
https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/196061
Significantly reduces register usage and eliminates register spilling, for example in `offload/test/offloading/multiple-reductions.cpp`. Provides a speedup of up to 5-10x for many of the reductions in larger setups of that kind.
Based on https://github.com/llvm/llvm-project/pull/195940.
See also the discussion in https://github.com/llvm/llvm-project/pull/195102.
From bb128b7689aedbc4a46d0578c25515ec8c0d16cc Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 6 May 2026 07:02:23 -0500
Subject: [PATCH] [OpenMP][offload] Inline target reductions
Significantly reduces register usage and eliminates register spilling,
for example in `offload/test/offloading/multiple-reductions.cpp`.
Provides a speedup of up to 5-10x for many of the reductions in larger
setups of that kind.
---
openmp/device/src/Reduction.cpp | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index f2a2d5e39aaa5..8a685d3bad885 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -22,15 +22,19 @@ using namespace ompx;
namespace {
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+[[clang::always_inline]]
+static void gpu_regular_warp_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct) {
for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
/*Offset = */ mask, /*AlgoVersion=*/0);
}
}
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
- uint32_t size, uint32_t tid) {
+[[clang::always_inline]]
+static void gpu_irregular_warp_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct, uint32_t size,
+ uint32_t tid) {
uint32_t curr_size;
uint32_t mask;
curr_size = size;
@@ -42,6 +46,7 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
+[[clang::always_inline]]
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
ShuffleReductFnTy shflFct) {
uint32_t size, remote_id, physical_lane_id;
@@ -61,6 +66,7 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
return (logical_lane_id == 0);
}
+[[clang::always_inline]]
static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
ShuffleReductFnTy shflFct,
InterWarpCopyFnTy cpyFct) {
@@ -155,17 +161,21 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
return BlockThreadId == 0;
}
-uint32_t roundToWarpsize(uint32_t s) {
+[[clang::always_inline]]
+static uint32_t roundToWarpsize(uint32_t s) {
if (s < mapping::getWarpSize())
return 1;
return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
+ return x < y ? x : y;
+}
} // namespace
extern "C" {
+[[clang::always_inline]]
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
uint64_t reduce_data_size,
void *reduce_data,
@@ -174,6 +184,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}
+[[clang::always_inline]]
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
More information about the llvm-branch-commits
mailing list