[llvm-branch-commits] [openmp] [OpenMP][offload] Inline target reductions (PR #196061)

Wed May 6 05:52:16 PDT 2026

https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/196061

Significantly reduces register usage and removes register spilling in `offload/test/offloading/multiple-reductions.cpp`, for example. Provides speedup of up to 5-10x for a lot of reductions in such a larger setup.

Based on https://github.com/llvm/llvm-project/pull/195940.
See also the discussion in https://github.com/llvm/llvm-project/pull/195102.

>From bb128b7689aedbc4a46d0578c25515ec8c0d16cc Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 6 May 2026 07:02:23 -0500
Subject: [PATCH] [OpenMP][offload] Inline target reductions

Significantly reduces register usage and removes register spilling in
`offload/test/offloading/multiple-reductions.cpp`, for example.
Provides speedup of up to 5-10x for a lot of reductions in such a larger
setup.
---
 openmp/device/src/Reduction.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index f2a2d5e39aaa5..8a685d3bad885 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -22,15 +22,19 @@ using namespace ompx;
 
 namespace {
 
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+[[clang::always_inline]]
+static void gpu_regular_warp_reduce(void *reduce_data,
+                                    ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
             /*Offset = */ mask, /*AlgoVersion=*/0);
   }
 }
 
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
-                               uint32_t size, uint32_t tid) {
+[[clang::always_inline]]
+static void gpu_irregular_warp_reduce(void *reduce_data,
+                                      ShuffleReductFnTy shflFct, uint32_t size,
+                                      uint32_t tid) {
   uint32_t curr_size;
   uint32_t mask;
   curr_size = size;
@@ -42,6 +46,7 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
+[[clang::always_inline]]
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -61,6 +66,7 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   return (logical_lane_id == 0);
 }
 
+[[clang::always_inline]]
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
                                             InterWarpCopyFnTy cpyFct) {
@@ -155,17 +161,21 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   return BlockThreadId == 0;
 }
 
-uint32_t roundToWarpsize(uint32_t s) {
+[[clang::always_inline]]
+static uint32_t roundToWarpsize(uint32_t s) {
   if (s < mapping::getWarpSize())
     return 1;
   return (s & ~(unsigned)(mapping::getWarpSize() - 1));
 }
 
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
+  return x < y ? x : y;
+}
 
 } // namespace
 
 extern "C" {
+[[clang::always_inline]]
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
@@ -174,6 +184,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
   return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
+[[clang::always_inline]]
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
     uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,