[Openmp-commits] [llvm] [openmp] [OpenMP][offload] Inline target reductions (PR #196061)

Wed May 6 08:31:50 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-offload

Author: Robert Imschweiler (ro-i)

<details>
<summary>Changes</summary>

Significantly reduces register usage and removes register spilling in `offload/test/offloading/multiple-reductions.cpp`, for example. Provides speedup of up to 5-10x for a lot of reductions in such a larger setup.

Based on https://github.com/llvm/llvm-project/pull/195940.
See also the discussion in https://github.com/llvm/llvm-project/pull/195102.

---
Full diff: https://github.com/llvm/llvm-project/pull/196061.diff


2 Files Affected:

- (added) offload/test/offloading/multiple_reductions.cpp (+129) 
- (modified) openmp/device/src/Reduction.cpp (+16-9) 


``````````diff

diff --git a/offload/test/offloading/multiple_reductions.cpp b/offload/test/offloading/multiple_reductions.cpp
new file mode 100644
index 0000000000000..3187076d06f6c
--- /dev/null
+++ b/offload/test/offloading/multiple_reductions.cpp
@@ -0,0 +1,129 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+// UNSUPPORTED: intelgpu
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "omp.h"
+
+#define N 10000
+
+template <typename T> static void init_data(T *a) {
+  for (int i = 0; i < N; ++i)
+    a[i] = i;
+}
+
+template <typename T> void run_type(void) {
+  T s1, s2;
+
+  T *in1 = static_cast<T *>(malloc(N * sizeof(T)));
+  T *in2 = static_cast<T *>(malloc(N * sizeof(T)));
+  assert(in1 && in2);
+
+  init_data(in1);
+  init_data(in2);
+
+#pragma omp target enter data map(to : in1[0 : N], in2[0 : N])
+
+  // Sum reduction
+  s1 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+  for (int i = 0; i < N; ++i)
+    s1 += in1[i];
+  // CHECK: 49995000
+  std::cout << s1 << '\n';
+
+  s1 = T(0);
+  // Indirect sum reduction
+  auto accumulate = [](T a, T b) { return a + b; };
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+  for (int i = 0; i < N; i++)
+    s1 = accumulate(s1, in1[i]);
+  // CHECK: 49995000
+  std::cout << s1 << '\n';
+
+  // Dot reduction
+  s1 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+  for (int i = 0; i < N; ++i)
+    s1 += in1[i] * in2[i];
+  // CHECK: 2570853208
+  std::cout << s1 << '\n';
+
+  // Combined reduction (sum and max) - in the same loop ...
+  s1 = s2 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)             \
+    reduction(max : s2)
+  for (int i = 0; i < N; ++i) {
+    s1 += in1[i];
+    s2 = in1[i] > s2 ? in1[i] : s2;
+  }
+  // CHECK: 49995000 : 9999
+  std::cout << s1 << " : " << s2 << '\n';
+
+  // ... and in separate loops
+  s1 = s2 = T(0);
+#pragma omp target map(tofrom : s1, s2)
+#pragma omp teams reduction(+ : s1) reduction(max : s2)
+  {
+#pragma omp distribute parallel for reduction(+ : s1)
+    for (int i = 0; i < N; i++)
+      s1 += in1[i];
+
+#pragma omp distribute parallel for reduction(max : s2)
+    for (int i = 0; i < N; i++)
+      s2 = in1[i] > s2 ? in1[i] : s2;
+  }
+  // CHECK: 49995000 : 9999
+  std::cout << s1 << " : " << s2 << '\n';
+
+  // Reduction in a kernel that is also doing something completely
+  // unrelated to the reduction (pure register work, no memory ops).
+  s1 = T(0);
+#pragma omp target map(tofrom : s1)
+#pragma omp teams reduction(+ : s1)
+  {
+#pragma omp distribute parallel for reduction(+ : s1)
+    for (int i = 0; i < N; i++)
+      s1 += in1[i];
+
+    // Just do something, without actually doing anything
+#pragma omp parallel
+    {
+      int x = omp_get_thread_num();
+      for (int j = 0; j < 100; j++)
+        x = x * 0.9 + j;
+      if (x == -1)
+        s1 += x;
+    }
+  }
+  // CHECK: 49995000
+  std::cout << s1 << '\n';
+
+#pragma omp target exit data map(delete : in1[0 : N], in2[0 : N])
+
+  free(in1);
+  free(in2);
+}
+
+int main(int argc, char **argv) {
+  run_type<double>();
+  run_type<unsigned>();
+  run_type<unsigned long>();
+
+  // Reduction calculating pi
+  double pi = 0.0;
+  // https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
+#pragma omp target teams distribute parallel for reduction(+ : pi)
+  for (int i = 0; i < N; i++) {
+    double term = 1.0 / (2 * i + 1);
+    pi += (i & 0x1) ? -term : term;
+  }
+  // CHECK: 3.141
+  printf("%.3f\n", pi * 4.0);
+
+  return EXIT_SUCCESS;
+}
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index f2a2d5e39aaa5..c719bafd81b31 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -20,17 +20,19 @@
 
 using namespace ompx;
 
-namespace {
-
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+[[clang::always_inline]]
+static void gpu_regular_warp_reduce(void *reduce_data,
+                                    ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
             /*Offset = */ mask, /*AlgoVersion=*/0);
   }
 }
 
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
-                               uint32_t size, uint32_t tid) {
+[[clang::always_inline]]
+static void gpu_irregular_warp_reduce(void *reduce_data,
+                                      ShuffleReductFnTy shflFct, uint32_t size,
+                                      uint32_t tid) {
   uint32_t curr_size;
   uint32_t mask;
   curr_size = size;
@@ -42,6 +44,7 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
+[[clang::always_inline]]
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -61,6 +64,7 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   return (logical_lane_id == 0);
 }
 
+[[clang::always_inline]]
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
                                             InterWarpCopyFnTy cpyFct) {
@@ -155,17 +159,19 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   return BlockThreadId == 0;
 }
 
-uint32_t roundToWarpsize(uint32_t s) {
+[[clang::always_inline]]
+static uint32_t roundToWarpsize(uint32_t s) {
   if (s < mapping::getWarpSize())
     return 1;
   return (s & ~(unsigned)(mapping::getWarpSize() - 1));
 }
 
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-} // namespace
+static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
+  return x < y ? x : y;
+}
 
 extern "C" {
+[[clang::always_inline]]
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
@@ -174,6 +180,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
   return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
+[[clang::always_inline]]
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
     uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,

``````````

</details>


https://github.com/llvm/llvm-project/pull/196061