[Openmp-commits] [llvm] [openmp] [offload][OpenMP] Fix partial warp reduction (PR #205861)
Robert Imschweiler via Openmp-commits
openmp-commits at lists.llvm.org
Thu Jun 25 11:46:33 PDT 2026
https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/205861
>From 2de5eb41c77a0ef8dcfcea7a7b86a82e0ef444b6 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 25 Jun 2026 12:00:23 -0500
Subject: [PATCH 1/3] [offload][OpenMP] Fix partial warp reduction
Don't use a full warp for the final cross-team reduction if the
reduction's thread limit is below the warp size.
Claude assisted with the test.
---
.../xteam_min_reduction_partial_wave.c | 41 +++++++++++++++++++
openmp/device/src/Reduction.cpp | 2 +-
2 files changed, 42 insertions(+), 1 deletion(-)
create mode 100644 offload/test/offloading/xteam_min_reduction_partial_wave.c
diff --git a/offload/test/offloading/xteam_min_reduction_partial_wave.c b/offload/test/offloading/xteam_min_reduction_partial_wave.c
new file mode 100644
index 0000000000000..df2b4e828288a
--- /dev/null
+++ b/offload/test/offloading/xteam_min_reduction_partial_wave.c
@@ -0,0 +1,41 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// RUN: %libomptarget-compileopt-run-and-check-generic
+
+// REQUIRES: gpu
+// UNSUPPORTED: intelgpu
+
+// Regression test for a cross-team reduction bug where the final xteam
+// reduction used a full-wave reduction path even though the kernel was launched
+// with fewer threads than the device wave size. On wave64 AMDGPU targets,
+// thread_limit(32) creates a partial wave. The 63-team case uses the
+// single-thread final reduction path; the 64-team case crosses the wave-size
+// boundary and must still ignore inactive lanes.
+
+#include <limits.h>
+#include <stdio.h>
+
+static unsigned reduce_min(int teams, int seed) {
+ unsigned min_val = UINT_MAX;
+
+#pragma omp target teams distribute parallel for num_teams(teams) \
+ thread_limit(32) map(to : seed) reduction(min : min_val)
+ for (int i = 0; i < 2017; ++i) {
+ unsigned val = 0xdeadbeefU + ((i + seed) & 1);
+ if (val < min_val)
+ min_val = val;
+ }
+
+ return min_val;
+}
+
+int main(int argc, char **argv) {
+ unsigned min63 = reduce_min(63, argc);
+ unsigned min64 = reduce_min(64, argc);
+
+ // CHECK: min63 = 0xdeadbeef
+ // CHECK: min64 = 0xdeadbeef
+ printf("min63 = %#x\n", min63);
+ printf("min64 = %#x\n", min64);
+
+ return min63 == 0xdeadbeefU && min64 == 0xdeadbeefU ? 0 : 1;
+}
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index ec772d357a425..df444fb9d90e4 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -280,7 +280,7 @@ int32_t __kmpc_gpu_xteam_reduce_nowait(IdentTy *Loc, void *reduce_data,
return 0;
// The last team performs final reduction across all team values.
- NumThreads = kmpc_min(NumThreads, round_down_to_warpsize(NumTeams));
+ NumThreads = round_down_to_warpsize(kmpc_min(NumThreads, NumTeams));
if (ThreadId >= NumThreads)
return 0;
>From f775aed0b30cf381793e9d3c0f45f601427ab03a Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 25 Jun 2026 13:35:10 -0500
Subject: [PATCH 2/3] make test clearer and a bit better
---
.../xteam_min_reduction_partial_wave.c | 37 ++++++++++++-------
1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/offload/test/offloading/xteam_min_reduction_partial_wave.c b/offload/test/offloading/xteam_min_reduction_partial_wave.c
index df2b4e828288a..f6322d1b13e1e 100644
--- a/offload/test/offloading/xteam_min_reduction_partial_wave.c
+++ b/offload/test/offloading/xteam_min_reduction_partial_wave.c
@@ -1,8 +1,9 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: %libomptarget-compileopt-generic
+// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
-// REQUIRES: gpu
-// UNSUPPORTED: intelgpu
+// REQUIRES: amdgpu
// Regression test for a cross-team reduction bug where the final xteam
// reduction used a full-wave reduction path even though the kernel was launched
@@ -14,13 +15,19 @@
#include <limits.h>
#include <stdio.h>
-static unsigned reduce_min(int teams, int seed) {
+#define THREAD_LIMIT 32
+#define TEAMS_BELOW_WAVE_SIZE_BOUNDARY 63
+#define TEAMS_AT_WAVE_SIZE_BOUNDARY 64
+#define NUM_ITERS (THREAD_LIMIT * TEAMS_BELOW_WAVE_SIZE_BOUNDARY + 1)
+#define EXPECTED_MIN UINT_MAX
+
+static unsigned reduce_min(int teams) {
unsigned min_val = UINT_MAX;
#pragma omp target teams distribute parallel for num_teams(teams) \
- thread_limit(32) map(to : seed) reduction(min : min_val)
- for (int i = 0; i < 2017; ++i) {
- unsigned val = 0xdeadbeefU + ((i + seed) & 1);
+ thread_limit(THREAD_LIMIT) reduction(min : min_val)
+ for (int i = 0; i < NUM_ITERS; ++i) {
+ unsigned val = EXPECTED_MIN;
if (val < min_val)
min_val = val;
}
@@ -28,14 +35,16 @@ static unsigned reduce_min(int teams, int seed) {
return min_val;
}
-int main(int argc, char **argv) {
- unsigned min63 = reduce_min(63, argc);
- unsigned min64 = reduce_min(64, argc);
+int main(void) {
+ unsigned min63 = reduce_min(TEAMS_BELOW_WAVE_SIZE_BOUNDARY);
+ unsigned min64 = reduce_min(TEAMS_AT_WAVE_SIZE_BOUNDARY);
- // CHECK: min63 = 0xdeadbeef
- // CHECK: min64 = 0xdeadbeef
+ // CHECK: Launching kernel {{.*}} with [63,1,1] blocks and [32,1,1] threads in SPMD mode
+ // CHECK: Launching kernel {{.*}} with [64,1,1] blocks and [32,1,1] threads in SPMD mode
+ // CHECK: min63 = 0xffffffff
+ // CHECK: min64 = 0xffffffff
printf("min63 = %#x\n", min63);
printf("min64 = %#x\n", min64);
- return min63 == 0xdeadbeefU && min64 == 0xdeadbeefU ? 0 : 1;
+ return min63 == EXPECTED_MIN && min64 == EXPECTED_MIN ? 0 : 1;
}
>From 81b21deb507c6e1519e0548a8c0f5c2fd708c4e5 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 25 Jun 2026 13:45:17 -0500
Subject: [PATCH 3/3] fix false positive clang-format suggestions
---
.../offloading/xteam_min_reduction_partial_wave.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/offload/test/offloading/xteam_min_reduction_partial_wave.c b/offload/test/offloading/xteam_min_reduction_partial_wave.c
index f6322d1b13e1e..11c45c7726ce1 100644
--- a/offload/test/offloading/xteam_min_reduction_partial_wave.c
+++ b/offload/test/offloading/xteam_min_reduction_partial_wave.c
@@ -1,7 +1,9 @@
// RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
// RUN: %libomptarget-compileopt-generic
-// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
// REQUIRES: amdgpu
@@ -39,8 +41,10 @@ int main(void) {
unsigned min63 = reduce_min(TEAMS_BELOW_WAVE_SIZE_BOUNDARY);
unsigned min64 = reduce_min(TEAMS_AT_WAVE_SIZE_BOUNDARY);
- // CHECK: Launching kernel {{.*}} with [63,1,1] blocks and [32,1,1] threads in SPMD mode
- // CHECK: Launching kernel {{.*}} with [64,1,1] blocks and [32,1,1] threads in SPMD mode
+ // CHECK: Launching kernel {{.*}} with [63,1,1] blocks and [32,1,1] threads
+ // CHECK-SAME: in SPMD mode
+ // CHECK: Launching kernel {{.*}} with [64,1,1] blocks and [32,1,1] threads
+ // CHECK-SAME: in SPMD mode
// CHECK: min63 = 0xffffffff
// CHECK: min64 = 0xffffffff
printf("min63 = %#x\n", min63);
More information about the Openmp-commits
mailing list