[Openmp-commits] [llvm] [openmp] [offload][OpenMP] Fix partial warp reduction (PR #205861)

Robert Imschweiler via Openmp-commits openmp-commits at lists.llvm.org
Thu Jun 25 11:46:33 PDT 2026


https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/205861

>From 2de5eb41c77a0ef8dcfcea7a7b86a82e0ef444b6 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 25 Jun 2026 12:00:23 -0500
Subject: [PATCH 1/3] [offload][OpenMP] Fix partial warp reduction

Don't use a full warp for the final cross-team reduction if the
reduction's thread limit is below the warp size.

Claude assisted with the test.
---
 .../xteam_min_reduction_partial_wave.c        | 41 +++++++++++++++++++
 openmp/device/src/Reduction.cpp               |  2 +-
 2 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 offload/test/offloading/xteam_min_reduction_partial_wave.c

diff --git a/offload/test/offloading/xteam_min_reduction_partial_wave.c b/offload/test/offloading/xteam_min_reduction_partial_wave.c
new file mode 100644
index 0000000000000..df2b4e828288a
--- /dev/null
+++ b/offload/test/offloading/xteam_min_reduction_partial_wave.c
@@ -0,0 +1,41 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// RUN: %libomptarget-compileopt-run-and-check-generic
+
+// REQUIRES: gpu
+// UNSUPPORTED: intelgpu
+
+// Regression test for a cross-team reduction bug where the final xteam
+// reduction used a full-wave reduction path even though the kernel was launched
+// with fewer threads than the device wave size.  On wave64 AMDGPU targets,
+// thread_limit(32) creates a partial wave.  The 63-team case uses the
+// single-thread final reduction path; the 64-team case crosses the wave-size
+// boundary and must still ignore inactive lanes.
+
+#include <limits.h>
+#include <stdio.h>
+
+static unsigned reduce_min(int teams, int seed) {
+  unsigned min_val = UINT_MAX;
+
+#pragma omp target teams distribute parallel for num_teams(teams)              \
+    thread_limit(32) map(to : seed) reduction(min : min_val)
+  for (int i = 0; i < 2017; ++i) {
+    unsigned val = 0xdeadbeefU + ((i + seed) & 1);
+    if (val < min_val)
+      min_val = val;
+  }
+
+  return min_val;
+}
+
+int main(int argc, char **argv) {
+  unsigned min63 = reduce_min(63, argc);
+  unsigned min64 = reduce_min(64, argc);
+
+  // CHECK: min63 = 0xdeadbeef
+  // CHECK: min64 = 0xdeadbeef
+  printf("min63 = %#x\n", min63);
+  printf("min64 = %#x\n", min64);
+
+  return min63 == 0xdeadbeefU && min64 == 0xdeadbeefU ? 0 : 1;
+}
diff --git a/openmp/device/src/Reduction.cpp b/openmp/device/src/Reduction.cpp
index ec772d357a425..df444fb9d90e4 100644
--- a/openmp/device/src/Reduction.cpp
+++ b/openmp/device/src/Reduction.cpp
@@ -280,7 +280,7 @@ int32_t __kmpc_gpu_xteam_reduce_nowait(IdentTy *Loc, void *reduce_data,
     return 0;
 
   // The last team performs final reduction across all team values.
-  NumThreads = kmpc_min(NumThreads, round_down_to_warpsize(NumTeams));
+  NumThreads = round_down_to_warpsize(kmpc_min(NumThreads, NumTeams));
   if (ThreadId >= NumThreads)
     return 0;
 

>From f775aed0b30cf381793e9d3c0f45f601427ab03a Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 25 Jun 2026 13:35:10 -0500
Subject: [PATCH 2/3] make test clearer and a bit better

---
 .../xteam_min_reduction_partial_wave.c        | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/offload/test/offloading/xteam_min_reduction_partial_wave.c b/offload/test/offloading/xteam_min_reduction_partial_wave.c
index df2b4e828288a..f6322d1b13e1e 100644
--- a/offload/test/offloading/xteam_min_reduction_partial_wave.c
+++ b/offload/test/offloading/xteam_min_reduction_partial_wave.c
@@ -1,8 +1,9 @@
-// RUN: %libomptarget-compile-run-and-check-generic
-// RUN: %libomptarget-compileopt-run-and-check-generic
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: %libomptarget-compileopt-generic
+// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
 
-// REQUIRES: gpu
-// UNSUPPORTED: intelgpu
+// REQUIRES: amdgpu
 
 // Regression test for a cross-team reduction bug where the final xteam
 // reduction used a full-wave reduction path even though the kernel was launched
@@ -14,13 +15,19 @@
 #include <limits.h>
 #include <stdio.h>
 
-static unsigned reduce_min(int teams, int seed) {
+#define THREAD_LIMIT 32
+#define TEAMS_BELOW_WAVE_SIZE_BOUNDARY 63
+#define TEAMS_AT_WAVE_SIZE_BOUNDARY 64
+#define NUM_ITERS (THREAD_LIMIT * TEAMS_BELOW_WAVE_SIZE_BOUNDARY + 1)
+#define EXPECTED_MIN UINT_MAX
+
+static unsigned reduce_min(int teams) {
   unsigned min_val = UINT_MAX;
 
 #pragma omp target teams distribute parallel for num_teams(teams)              \
-    thread_limit(32) map(to : seed) reduction(min : min_val)
-  for (int i = 0; i < 2017; ++i) {
-    unsigned val = 0xdeadbeefU + ((i + seed) & 1);
+    thread_limit(THREAD_LIMIT) reduction(min : min_val)
+  for (int i = 0; i < NUM_ITERS; ++i) {
+    unsigned val = EXPECTED_MIN;
     if (val < min_val)
       min_val = val;
   }
@@ -28,14 +35,16 @@ static unsigned reduce_min(int teams, int seed) {
   return min_val;
 }
 
-int main(int argc, char **argv) {
-  unsigned min63 = reduce_min(63, argc);
-  unsigned min64 = reduce_min(64, argc);
+int main(void) {
+  unsigned min63 = reduce_min(TEAMS_BELOW_WAVE_SIZE_BOUNDARY);
+  unsigned min64 = reduce_min(TEAMS_AT_WAVE_SIZE_BOUNDARY);
 
-  // CHECK: min63 = 0xdeadbeef
-  // CHECK: min64 = 0xdeadbeef
+  // CHECK: Launching kernel {{.*}} with [63,1,1] blocks and [32,1,1] threads in SPMD mode
+  // CHECK: Launching kernel {{.*}} with [64,1,1] blocks and [32,1,1] threads in SPMD mode
+  // CHECK: min63 = 0xffffffff
+  // CHECK: min64 = 0xffffffff
   printf("min63 = %#x\n", min63);
   printf("min64 = %#x\n", min64);
 
-  return min63 == 0xdeadbeefU && min64 == 0xdeadbeefU ? 0 : 1;
+  return min63 == EXPECTED_MIN && min64 == EXPECTED_MIN ? 0 : 1;
 }

>From 81b21deb507c6e1519e0548a8c0f5c2fd708c4e5 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 25 Jun 2026 13:45:17 -0500
Subject: [PATCH 3/3] fix false positive clang-format suggestions

---
 .../offloading/xteam_min_reduction_partial_wave.c    | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/offload/test/offloading/xteam_min_reduction_partial_wave.c b/offload/test/offloading/xteam_min_reduction_partial_wave.c
index f6322d1b13e1e..11c45c7726ce1 100644
--- a/offload/test/offloading/xteam_min_reduction_partial_wave.c
+++ b/offload/test/offloading/xteam_min_reduction_partial_wave.c
@@ -1,7 +1,9 @@
 // RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
 // RUN: %libomptarget-compileopt-generic
-// RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic
 
 // REQUIRES: amdgpu
 
@@ -39,8 +41,10 @@ int main(void) {
   unsigned min63 = reduce_min(TEAMS_BELOW_WAVE_SIZE_BOUNDARY);
   unsigned min64 = reduce_min(TEAMS_AT_WAVE_SIZE_BOUNDARY);
 
-  // CHECK: Launching kernel {{.*}} with [63,1,1] blocks and [32,1,1] threads in SPMD mode
-  // CHECK: Launching kernel {{.*}} with [64,1,1] blocks and [32,1,1] threads in SPMD mode
+  // CHECK: Launching kernel {{.*}} with [63,1,1] blocks and [32,1,1] threads
+  // CHECK-SAME: in SPMD mode
+  // CHECK: Launching kernel {{.*}} with [64,1,1] blocks and [32,1,1] threads
+  // CHECK-SAME: in SPMD mode
   // CHECK: min63 = 0xffffffff
   // CHECK: min64 = 0xffffffff
   printf("min63 = %#x\n", min63);



More information about the Openmp-commits mailing list