[Openmp-commits] [llvm] [openmp] [OpenMP][offload] Inline target reductions (PR #196061)
Robert Imschweiler via Openmp-commits
openmp-commits at lists.llvm.org
Wed May 6 08:30:49 PDT 2026
https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/196061
>From 325463fa8318c972d9612394035d9e1faf77ec36 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Tue, 5 May 2026 15:44:49 -0500
Subject: [PATCH 1/3] [OpenMP][offload] Add enhanced cross-team reduction test
Tests different patterns of OpenMP cross-team reductions, for multiple
data types.
If run with `LIBOMPTARGET_INFO=16`, shows current register spilling due
to dispatch jump chains (which grow for every reduction in the same
translation unit) for indirect function calls in the reduction runtime.
---
.../test/offloading/multiple_reductions.cpp | 129 ++++++++++++++++++
1 file changed, 129 insertions(+)
create mode 100644 offload/test/offloading/multiple_reductions.cpp
diff --git a/offload/test/offloading/multiple_reductions.cpp b/offload/test/offloading/multiple_reductions.cpp
new file mode 100644
index 0000000000000..3187076d06f6c
--- /dev/null
+++ b/offload/test/offloading/multiple_reductions.cpp
@@ -0,0 +1,129 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+// UNSUPPORTED: intelgpu
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "omp.h"
+
+#define N 10000
+
+template <typename T> static void init_data(T *a) {
+ for (int i = 0; i < N; ++i)
+ a[i] = i;
+}
+
+template <typename T> void run_type(void) {
+ T s1, s2;
+
+ T *in1 = static_cast<T *>(malloc(N * sizeof(T)));
+ T *in2 = static_cast<T *>(malloc(N * sizeof(T)));
+ assert(in1 && in2);
+
+ init_data(in1);
+ init_data(in2);
+
+#pragma omp target enter data map(to : in1[0 : N], in2[0 : N])
+
+ // Sum reduction
+ s1 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+ for (int i = 0; i < N; ++i)
+ s1 += in1[i];
+ // CHECK: 49995000
+ std::cout << s1 << '\n';
+
+ s1 = T(0);
+ // Indirect sum reduction
+ auto accumulate = [](T a, T b) { return a + b; };
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+ for (int i = 0; i < N; i++)
+ s1 = accumulate(s1, in1[i]);
+ // CHECK: 49995000
+ std::cout << s1 << '\n';
+
+ // Dot reduction
+ s1 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1)
+ for (int i = 0; i < N; ++i)
+ s1 += in1[i] * in2[i];
+ // CHECK: 2570853208
+ std::cout << s1 << '\n';
+
+ // Combined reduction (sum and max) - in the same loop ...
+ s1 = s2 = T(0);
+#pragma omp target teams distribute parallel for reduction(+ : s1) \
+ reduction(max : s2)
+ for (int i = 0; i < N; ++i) {
+ s1 += in1[i];
+ s2 = in1[i] > s2 ? in1[i] : s2;
+ }
+ // CHECK: 49995000 : 9999
+ std::cout << s1 << " : " << s2 << '\n';
+
+ // ... and in separate loops
+ s1 = s2 = T(0);
+#pragma omp target map(tofrom : s1, s2)
+#pragma omp teams reduction(+ : s1) reduction(max : s2)
+ {
+#pragma omp distribute parallel for reduction(+ : s1)
+ for (int i = 0; i < N; i++)
+ s1 += in1[i];
+
+#pragma omp distribute parallel for reduction(max : s2)
+ for (int i = 0; i < N; i++)
+ s2 = in1[i] > s2 ? in1[i] : s2;
+ }
+ // CHECK: 49995000 : 9999
+ std::cout << s1 << " : " << s2 << '\n';
+
+ // Reduction in a kernel that is also doing something completely
+ // unrelated to the reduction (pure register work, no memory ops).
+ s1 = T(0);
+#pragma omp target map(tofrom : s1)
+#pragma omp teams reduction(+ : s1)
+ {
+#pragma omp distribute parallel for reduction(+ : s1)
+ for (int i = 0; i < N; i++)
+ s1 += in1[i];
+
+ // Just do something, without actually doing anything
+#pragma omp parallel
+ {
+ int x = omp_get_thread_num();
+ for (int j = 0; j < 100; j++)
+ x = x * 0.9 + j;
+ if (x == -1)
+ s1 += x;
+ }
+ }
+ // CHECK: 49995000
+ std::cout << s1 << '\n';
+
+#pragma omp target exit data map(delete : in1[0 : N], in2[0 : N])
+
+ free(in1);
+ free(in2);
+}
+
+int main(int argc, char **argv) {
+ run_type<double>();
+ run_type<unsigned>();
+ run_type<unsigned long>();
+
+ // Reduction calculating pi
+ double pi = 0.0;
+ // https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
+#pragma omp target teams distribute parallel for reduction(+ : pi)
+ for (int i = 0; i < N; i++) {
+ double term = 1.0 / (2 * i + 1);
+ pi += (i & 0x1) ? -term : term;
+ }
+ // CHECK: 3.141
+ printf("%.3f\n", pi * 4.0);
+
+ return EXIT_SUCCESS;
+}
>From bb128b7689aedbc4a46d0578c25515ec8c0d16cc Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 6 May 2026 07:02:23 -0500
Subject: [PATCH 2/3] [OpenMP][offload] Inline target reductions
Significantly reduces register usage and removes register spilling in
`offload/test/offloading/multiple-reductions.cpp`, for example.
Provides speedup of up to 5-10x for a lot of reductions in such a larger
setup.
---
openmp/device/src/Reduction.cpp | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index f2a2d5e39aaa5..8a685d3bad885 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -22,15 +22,19 @@ using namespace ompx;
namespace {
-void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+[[clang::always_inline]]
+static void gpu_regular_warp_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct) {
for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
/*Offset = */ mask, /*AlgoVersion=*/0);
}
}
-void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
- uint32_t size, uint32_t tid) {
+[[clang::always_inline]]
+static void gpu_irregular_warp_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct, uint32_t size,
+ uint32_t tid) {
uint32_t curr_size;
uint32_t mask;
curr_size = size;
@@ -42,6 +46,7 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
+[[clang::always_inline]]
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
ShuffleReductFnTy shflFct) {
uint32_t size, remote_id, physical_lane_id;
@@ -61,6 +66,7 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
return (logical_lane_id == 0);
}
+[[clang::always_inline]]
static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
ShuffleReductFnTy shflFct,
InterWarpCopyFnTy cpyFct) {
@@ -155,17 +161,21 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
return BlockThreadId == 0;
}
-uint32_t roundToWarpsize(uint32_t s) {
+[[clang::always_inline]]
+static uint32_t roundToWarpsize(uint32_t s) {
if (s < mapping::getWarpSize())
return 1;
return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}
-uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
+ return x < y ? x : y;
+}
} // namespace
extern "C" {
+[[clang::always_inline]]
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
uint64_t reduce_data_size,
void *reduce_data,
@@ -174,6 +184,7 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}
+[[clang::always_inline]]
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
>From b92afabeb1b42d621c904bf8473d2cfdd0f9af88 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 6 May 2026 10:30:20 -0500
Subject: [PATCH 3/3] remove namespace
---
openmp/device/src/Reduction.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index 8a685d3bad885..c719bafd81b31 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -20,8 +20,6 @@
using namespace ompx;
-namespace {
-
[[clang::always_inline]]
static void gpu_regular_warp_reduce(void *reduce_data,
ShuffleReductFnTy shflFct) {
@@ -172,8 +170,6 @@ static constexpr uint32_t kmpcMin(uint32_t x, uint32_t y) {
return x < y ? x : y;
}
-} // namespace
-
extern "C" {
[[clang::always_inline]]
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
More information about the Openmp-commits
mailing list