[Openmp-commits] [openmp] [Offload][OpenMP][libdevice] Make check to enter state machine architecture dependent (PR #188144)
Alex Duran via Openmp-commits
openmp-commits at lists.llvm.org
Wed Mar 25 01:21:01 PDT 2026
https://github.com/adurang updated https://github.com/llvm/llvm-project/pull/188144
>From 7d4a1ea27654a576f64ad594359f406d5482528c Mon Sep 17 00:00:00 2001
From: "Duran, Alex" <alejandro.duran at intel.com>
Date: Mon, 23 Mar 2026 07:46:13 -0700
Subject: [PATCH 1/2] [Offload][OpenMP][libdevice] Make check to enter state
machine architecture dependent
---
openmp/device/include/Configuration.h | 39 +++++++++++++++++++++++++++
openmp/device/src/Kernel.cpp | 19 +++----------
2 files changed, 43 insertions(+), 15 deletions(-)
diff --git a/openmp/device/include/Configuration.h b/openmp/device/include/Configuration.h
index 95408933dd865..8366c78da1030 100644
--- a/openmp/device/include/Configuration.h
+++ b/openmp/device/include/Configuration.h
@@ -62,6 +62,45 @@ bool mayUseThreadStates();
/// parallelism, or if it was explicitly disabled by the user.
bool mayUseNestedParallelism();
+/// Returns true if the current thread should enter the generic state machine.
+/// On some architectures, some threads should not enter the state machine to
+/// avoid warp-level barrier forwarding issues during initialization.
+/// On other architectures, all threads must enter the state machine to satisfy
+/// the requirements of workgroup synchronization.
+static inline bool shouldEnterStateMachine(bool IsSPMD);
+
+} // namespace config
+} // namespace ompx
+
+#include "Mapping.h"
+
+namespace ompx {
+namespace config {
+
+static inline bool shouldEnterStateMachine(bool IsSPMD) {
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+ // This check is important for NVIDIA Pascal (but not Volta) and AMD
+ // GPU. In those cases, a single thread can apparently satisfy a barrier on
+ // behalf of all threads in the same warp. Thus, it would not be safe for
+ // other threads in the main thread's warp to reach the first
+ // synchronize::threads call in genericStateMachine before the main thread
+ // reaches its corresponding synchronize::threads call: that would permit all
+ // active worker threads to proceed before the main thread has actually set
+ // state::ParallelRegionFn, and then they would immediately quit without
+ // doing any work. mapping::getMaxTeamThreads() does not include any of the
+ // main thread's warp, so none of its threads can ever be active worker
+ // threads.
+ return mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD);
+#else
+ // On other architectures (e.g., Intel GPUs) all threads must enter the state
+ // machine to satisfy the requirements of workgroup of synchronize::threads
+ // call in genericStateMachine. Otherwise, the workers will wait on the
+ // call to synchronize::threads forever and never proceed.
+ (void)IsSPMD;
+ return true;
+#endif
+}
+
} // namespace config
} // namespace ompx
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index a180df7b982e3..40474f54b2d61 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -108,21 +108,10 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
return -1;
// Enter the generic state machine if enabled and if this thread can possibly
- // be an active worker thread.
- //
- // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
- // GPU. In those cases, a single thread can apparently satisfy a barrier on
- // behalf of all threads in the same warp. Thus, it would not be safe for
- // other threads in the main thread's warp to reach the first
- // synchronize::threads call in genericStateMachine before the main thread
- // reaches its corresponding synchronize::threads call: that would permit all
- // active worker threads to proceed before the main thread has actually set
- // state::ParallelRegionFn, and then they would immediately quit without
- // doing any work. mapping::getMaxTeamThreads() does not include any of the
- // main thread's warp, so none of its threads can ever be active worker
- // threads.
- if (UseGenericStateMachine &&
- mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
+ // be an active worker thread. The shouldEnterStateMachine check is
+ // architecture-specific and handles platforms where warp-level barrier
+ // forwarding could cause races during state machine initialization.
+ if (UseGenericStateMachine && config::shouldEnterStateMachine(IsSPMD))
genericStateMachine(KernelEnvironment.Ident);
return mapping::getThreadIdInBlock();
>From 4b674cc1516c4eaf3703f3de85b16be116a0b5b9 Mon Sep 17 00:00:00 2001
From: "Duran, Alex" <alejandro.duran at intel.com>
Date: Wed, 25 Mar 2026 01:20:19 -0700
Subject: [PATCH 2/2] move code around
---
openmp/device/include/Configuration.h | 39 ---------------------------
openmp/device/src/Kernel.cpp | 27 ++++++++++++++++++-
2 files changed, 26 insertions(+), 40 deletions(-)
diff --git a/openmp/device/include/Configuration.h b/openmp/device/include/Configuration.h
index 8366c78da1030..95408933dd865 100644
--- a/openmp/device/include/Configuration.h
+++ b/openmp/device/include/Configuration.h
@@ -62,45 +62,6 @@ bool mayUseThreadStates();
/// parallelism, or if it was explicitly disabled by the user.
bool mayUseNestedParallelism();
-/// Returns true if the current thread should enter the generic state machine.
-/// On some architectures, some threads should not enter the state machine to
-/// avoid warp-level barrier forwarding issues during initialization.
-/// On other architectures, all threads must enter the state machine to satisfy
-/// the requirements of workgroup synchronization.
-static inline bool shouldEnterStateMachine(bool IsSPMD);
-
-} // namespace config
-} // namespace ompx
-
-#include "Mapping.h"
-
-namespace ompx {
-namespace config {
-
-static inline bool shouldEnterStateMachine(bool IsSPMD) {
-#if defined(__NVPTX__) || defined(__AMDGPU__)
- // This check is important for NVIDIA Pascal (but not Volta) and AMD
- // GPU. In those cases, a single thread can apparently satisfy a barrier on
- // behalf of all threads in the same warp. Thus, it would not be safe for
- // other threads in the main thread's warp to reach the first
- // synchronize::threads call in genericStateMachine before the main thread
- // reaches its corresponding synchronize::threads call: that would permit all
- // active worker threads to proceed before the main thread has actually set
- // state::ParallelRegionFn, and then they would immediately quit without
- // doing any work. mapping::getMaxTeamThreads() does not include any of the
- // main thread's warp, so none of its threads can ever be active worker
- // threads.
- return mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD);
-#else
- // On other architectures (e.g., Intel GPUs) all threads must enter the state
- // machine to satisfy the requirements of workgroup of synchronize::threads
- // call in genericStateMachine. Otherwise, the workers will wait on the
- // call to synchronize::threads forever and never proceed.
- (void)IsSPMD;
- return true;
-#endif
-}
-
} // namespace config
} // namespace ompx
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index 40474f54b2d61..d6b8659436156 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -44,6 +44,31 @@ initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
workshare::init(IsSPMD);
}
+/// Returns true if the current thread should enter the generic state machine.
+static bool shouldEnterStateMachine(bool IsSPMD) {
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+ // This check is important for NVIDIA Pascal (but not Volta) and AMD
+ // GPUs. In those cases, a single thread can apparently satisfy a barrier on
+ // behalf of all threads in the same warp. Thus, it would not be safe for
+ // other threads in the main thread's warp to reach the first
+ // synchronize::threads call in genericStateMachine before the main thread
+ // reaches its corresponding synchronize::threads call: that would permit all
+ // active worker threads to proceed before the main thread has actually set
+ // state::ParallelRegionFn, and then they would immediately quit without
+ // doing any work. mapping::getMaxTeamThreads() does not include any of the
+ // main thread's warp, so none of its threads can ever be active worker
+ // threads.
+ return mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD);
+#else
+ // On other architectures (e.g., Intel GPUs), all threads must enter the state
+ // machine to satisfy the workgroup synchronization requirements of the
+ // synchronize::threads call in genericStateMachine. Otherwise, the workers
+ // will wait on that call forever and never proceed.
+ (void)IsSPMD;
+ return true;
+#endif
+}
+
/// Simple generic state machine for worker threads.
static void genericStateMachine(IdentTy *Ident) {
uint32_t TId = mapping::getThreadIdInBlock();
@@ -111,7 +136,7 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
// be an active worker thread. The shouldEnterStateMachine check is
// architecture-specific and handles platforms where warp-level barrier
// forwarding could cause races during state machine initialization.
- if (UseGenericStateMachine && config::shouldEnterStateMachine(IsSPMD))
+ if (UseGenericStateMachine && shouldEnterStateMachine(IsSPMD))
genericStateMachine(KernelEnvironment.Ident);
return mapping::getThreadIdInBlock();
More information about the Openmp-commits
mailing list