[Openmp-commits] [llvm] [openmp] [OpenMP][offload] Inline target reductions (PR #196061)

Fri May 8 02:30:09 PDT 2026

https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/196061

>From 325463fa8318c972d9612394035d9e1faf77ec36 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Tue, 5 May 2026 15:44:49 -0500
Subject: [PATCH 1/4] [OpenMP][offload] Add enhanced cross-team reduction test

Tests different patterns of OpenMP cross-team reductions, for multiple
data types.
If run with `LIBOMPTARGET_INFO=16`, shows current register spilling due
to dispatch jump chains (which grow for every reduction in the same
translation unit) for indirect function calls in the reduction runtime.
---
 .../test/offloading/multiple_reductions.cpp   | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 offload/test/offloading/multiple_reductions.cpp

diff --git a/offload/test/offloading/multiple_reductions.cpp b/offload/test/offloading/multiple_reductions.cpp
new file mode 100644
index 0000000000000..3187076d06f6c
--- /dev/null
+++ b/offload/test/offloading/multiple_reductions.cpp
@@ -0,0 +1,129 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+// UNSUPPORTED: intelgpu
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "omp.h"
+
+#define N 10000
+
+template <typename T> static void init_data(T *a) {
+  for (int i = 0; i < N; ++i)
+    a[i] = i;
+}
+
+template <typename T> void run_type(void) {
+  T s1, s2;
+
+  T *in1 = static_cast<T *>(malloc(N * sizeof(T)));
+  T *in2 = static_cast<T *>(malloc(N * sizeof(T)));
+  assert(in1 && in2);
+
+  init_data(in1);
+  init_data(in2);
+
+#pragma omp target enter data map(to : in1[0 : N], in2[0 : N])
+
+  // Sum reduction
+  s1 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+  for (int i = 0; i < N; ++i)
+    s1 += in1[i];
+  // CHECK: 49995000
+  std::cout << s1 << '\n';
+
+  s1 = T(0);
+  // Indirect sum reduction
+  auto accumulate = [](T a, T b) { return a + b; };
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+  for (int i = 0; i < N; i++)
+    s1 = accumulate(s1, in1[i]);
+  // CHECK: 49995000
+  std::cout << s1 << '\n';
+
+  // Dot reduction
+  s1 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+  for (int i = 0; i < N; ++i)
+    s1 += in1[i] * in2[i];
+  // CHECK: 2570853208
+  std::cout << s1 << '\n';
+
+  // Combined reduction (sum and max) - in the same loop ...
+  s1 = s2 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)             \
+    reduction(max : s2)
+  for (int i = 0; i < N; ++i) {
+    s1 += in1[i];
+    s2 = in1[i] > s2 ? in1[i] : s2;
+  }
+  // CHECK: 49995000 : 9999
+  std::cout << s1 << " : " << s2 << '\n';
+
+  // ... and in separate loops
+  s1 = s2 = T(0);
+#pragma omp target map(tofrom : s1, s2)
+#pragma omp teams reduction(+ : s1) reduction(max : s2)
+  {
+#pragma omp distribute parallel for reduction(+ : s1)
+    for (int i = 0; i < N; i++)
+      s1 += in1[i];
+
+#pragma omp distribute parallel for reduction(max : s2)
+    for (int i = 0; i < N; i++)
+      s2 = in1[i] > s2 ? in1[i] : s2;
+  }
+  // CHECK: 49995000 : 9999
+  std::cout << s1 << " : " << s2 << '\n';
+
+  // Reduction in a kernel that is also doing something completely
+  // unrelated to the reduction (pure register work, no memory ops).
+  s1 = T(0);
+#pragma omp target map(tofrom : s1)
+#pragma omp teams reduction(+ : s1)
+  {
+#pragma omp distribute parallel for reduction(+ : s1)
+    for (int i = 0; i < N; i++)
+      s1 += in1[i];
+
+    // Just do something, without actually doing anything
+#pragma omp parallel
+    {
+      int x = omp_get_thread_num();
+      for (int j = 0; j < 100; j++)
+        x = x * 0.9 + j;
+      if (x == -1)
+        s1 += x;
+    }
+  }
+  // CHECK: 49995000
+  std::cout << s1 << '\n';
+
+#pragma omp target exit data map(delete : in1[0 : N], in2[0 : N])
+
+  free(in1);
+  free(in2);
+}
+
+int main(int argc, char **argv) {
+  run_type<double>();
+  run_type<unsigned>();
+  run_type<unsigned long>();
+
+  // Reduction calculating pi
+  double pi = 0.0;
+  // https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
+#pragma omp target teams distribute parallel for reduction(+ : pi)
+  for (int i = 0; i < N; i++) {
+    double term = 1.0 / (2 * i + 1);
+    pi += (i & 0x1) ? -term : term;
+  }
+  // CHECK: 3.141
+  printf("%.3f\n", pi * 4.0);
+
+  return EXIT_SUCCESS;
+}

>From bb128b7689aedbc4a46d0578c25515ec8c0d16cc Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 6 May 2026 07:02:23 -0500
Subject: [PATCH 2/4] [OpenMP][offload] Inline target reductions

Significantly reduces register usage and removes register spilling in
`offload/test/offloading/multiple-reductions.cpp`, for example.
Provides speedup of up to 5-10x for a lot of reductions in such a larger
setup.
---
 openmp/device/src/Reduction.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index f2a2d5e39aaa5..8a685d3bad885 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -22,15 +22,19 @@ using namespace ompx;
 
 namespace {
 
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+[[clang::always_inline]]
+static void gpu_regular_warp_reduce(void *reduce_data,
+                                    ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
             /*Offset = */ mask, /*AlgoVersion=*/0);
   }
 }
 
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
-                               uint32_t size, uint32_t tid) {
+[[clang::always_inline]]
+static void gpu_irregular_warp_reduce(void *reduce_data,
+                                      ShuffleReductFnTy shflFct, uint32_t size,
+                                      uint32_t tid) {
   uint32_t curr_size;
   uint32_t mask;
   curr_size = size;
@@ -42,6 +46,7 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
+[[clang::always_inline]]
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -61,6 +66,7 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   return (logical_lane_id == 0);
 }
 
+[[clang::always_inline]]
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
                                             InterWarpCopyFnTy cpyFct) {
@@ -155,17 +161,21 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   return BlockThreadId == 0;
 }
 
-uint32_t roundToWarpsize(uint32_t s) {
+[[clang::always_inline]]
+static uint32_t roundToWarpsize(uint32_t s) {
   if (s < mapping::getWarpSize())
     return 1;
   return (s & ~(unsigned)(mapping::getWarpSize() - 1));
 }
 
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
+  return x < y ? x : y;
+}
 
 } // namespace
 
 extern "C" {
+[[clang::always_inline]]
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
                                                uint64_t reduce_data_size,
                                                void *reduce_data,
@@ -174,6 +184,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
   return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
+[[clang::always_inline]]
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
     uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,

>From b92afabeb1b42d621c904bf8473d2cfdd0f9af88 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 6 May 2026 10:30:20 -0500
Subject: [PATCH 3/4] remove namespace

---
 openmp/device/src/Reduction.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index 8a685d3bad885..c719bafd81b31 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -20,8 +20,6 @@
 
 using namespace ompx;
 
-namespace {
-
 [[clang::always_inline]]
 static void gpu_regular_warp_reduce(void *reduce_data,
                                     ShuffleReductFnTy shflFct) {
@@ -172,8 +170,6 @@ static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
   return x < y ? x : y;
 }
 
-} // namespace
-
 extern "C" {
 [[clang::always_inline]]
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,

>From 8484aeac8cc5a68391c9e353648771b3690ed07f Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Fri, 8 May 2026 04:29:31 -0500
Subject: [PATCH 4/4] remove unneeded always_inline instances

---
 openmp/device/src/Reduction.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index c719bafd81b31..109e1127bde3d 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -20,7 +20,6 @@
 
 using namespace ompx;
 
-[[clang::always_inline]]
 static void gpu_regular_warp_reduce(void *reduce_data,
                                     ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
@@ -29,7 +28,6 @@ static void gpu_regular_warp_reduce(void *reduce_data,
   }
 }
 
-[[clang::always_inline]]
 static void gpu_irregular_warp_reduce(void *reduce_data,
                                       ShuffleReductFnTy shflFct, uint32_t size,
                                       uint32_t tid) {
@@ -44,7 +42,6 @@ static void gpu_irregular_warp_reduce(void *reduce_data,
   }
 }
 
-[[clang::always_inline]]
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -64,7 +61,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   return (logical_lane_id == 0);
 }
 
-[[clang::always_inline]]
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
                                             InterWarpCopyFnTy cpyFct) {
@@ -159,7 +155,6 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   return BlockThreadId == 0;
 }
 
-[[clang::always_inline]]
 static uint32_t roundToWarpsize(uint32_t s) {
   if (s < mapping::getWarpSize())
     return 1;