[Openmp-commits] [openmp] [Offload][OpenMP][libdevice] Make check to enter state machine architecture dependent (PR #188144)

Wed Mar 25 01:21:01 PDT 2026

https://github.com/adurang updated https://github.com/llvm/llvm-project/pull/188144

>From 7d4a1ea27654a576f64ad594359f406d5482528c Mon Sep 17 00:00:00 2001
From: "Duran, Alex" <alejandro.duran at intel.com>
Date: Mon, 23 Mar 2026 07:46:13 -0700
Subject: [PATCH 1/2] [Offload][OpenMP][libdevice] Make check to enter state
 machine architecture dependent

---
 openmp/device/include/Configuration.h | 39 +++++++++++++++++++++++++++
 openmp/device/src/Kernel.cpp          | 19 +++----------
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/openmp/device/include/Configuration.h b/openmp/device/include/Configuration.h
index 95408933dd865..8366c78da1030 100644
--- a/openmp/device/include/Configuration.h
+++ b/openmp/device/include/Configuration.h
@@ -62,6 +62,45 @@ bool mayUseThreadStates();
 /// parallelism, or if it was explicitly disabled by the user.
 bool mayUseNestedParallelism();
 
+/// Returns true if the current thread should enter the generic state machine.
+/// On some architectures, some threads should not enter the state machine to
+/// avoid warp-level barrier forwarding issues during initialization.
+/// On other architectures, all threads must enter the state machine to satisfy
+/// the requirements of workgroup synchronization.
+static inline bool shouldEnterStateMachine(bool IsSPMD);
+
+} // namespace config
+} // namespace ompx
+
+#include "Mapping.h"
+
+namespace ompx {
+namespace config {
+
+static inline bool shouldEnterStateMachine(bool IsSPMD) {
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+  // This check is important for NVIDIA Pascal (but not Volta) and AMD
+  // GPU. In those cases, a single thread can apparently satisfy a barrier on
+  // behalf of all threads in the same warp. Thus, it would not be safe for
+  // other threads in the main thread's warp to reach the first
+  // synchronize::threads call in genericStateMachine before the main thread
+  // reaches its corresponding synchronize::threads call: that would permit all
+  // active worker threads to proceed before the main thread has actually set
+  // state::ParallelRegionFn, and then they would immediately quit without
+  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
+  // main thread's warp, so none of its threads can ever be active worker
+  // threads.
+  return mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD);
+#else
+  // On other architectures (e.g., Intel GPUs) all threads must enter the state
+  // machine to satisfy the requirements of workgroup of synchronize::threads
+  // call in genericStateMachine. Otherwise, the workers will wait on the
+  // call to synchronize::threads forever and never proceed.
+  (void)IsSPMD;
+  return true;
+#endif
+}
+
 } // namespace config
 } // namespace ompx
 
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index a180df7b982e3..40474f54b2d61 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -108,21 +108,10 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
     return -1;
 
   // Enter the generic state machine if enabled and if this thread can possibly
-  // be an active worker thread.
-  //
-  // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
-  // GPU.  In those cases, a single thread can apparently satisfy a barrier on
-  // behalf of all threads in the same warp.  Thus, it would not be safe for
-  // other threads in the main thread's warp to reach the first
-  // synchronize::threads call in genericStateMachine before the main thread
-  // reaches its corresponding synchronize::threads call: that would permit all
-  // active worker threads to proceed before the main thread has actually set
-  // state::ParallelRegionFn, and then they would immediately quit without
-  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
-  // main thread's warp, so none of its threads can ever be active worker
-  // threads.
-  if (UseGenericStateMachine &&
-      mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
+  // be an active worker thread. The shouldEnterStateMachine check is
+  // architecture-specific and handles platforms where warp-level barrier
+  // forwarding could cause races during state machine initialization.
+  if (UseGenericStateMachine && config::shouldEnterStateMachine(IsSPMD))
     genericStateMachine(KernelEnvironment.Ident);
 
   return mapping::getThreadIdInBlock();

>From 4b674cc1516c4eaf3703f3de85b16be116a0b5b9 Mon Sep 17 00:00:00 2001
From: "Duran, Alex" <alejandro.duran at intel.com>
Date: Wed, 25 Mar 2026 01:20:19 -0700
Subject: [PATCH 2/2] move code around

---
 openmp/device/include/Configuration.h | 39 ---------------------------
 openmp/device/src/Kernel.cpp          | 27 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/openmp/device/include/Configuration.h b/openmp/device/include/Configuration.h
index 8366c78da1030..95408933dd865 100644
--- a/openmp/device/include/Configuration.h
+++ b/openmp/device/include/Configuration.h
@@ -62,45 +62,6 @@ bool mayUseThreadStates();
 /// parallelism, or if it was explicitly disabled by the user.
 bool mayUseNestedParallelism();
 
-/// Returns true if the current thread should enter the generic state machine.
-/// On some architectures, some threads should not enter the state machine to
-/// avoid warp-level barrier forwarding issues during initialization.
-/// On other architectures, all threads must enter the state machine to satisfy
-/// the requirements of workgroup synchronization.
-static inline bool shouldEnterStateMachine(bool IsSPMD);
-
-} // namespace config
-} // namespace ompx
-
-#include "Mapping.h"
-
-namespace ompx {
-namespace config {
-
-static inline bool shouldEnterStateMachine(bool IsSPMD) {
-#if defined(__NVPTX__) || defined(__AMDGPU__)
-  // This check is important for NVIDIA Pascal (but not Volta) and AMD
-  // GPU. In those cases, a single thread can apparently satisfy a barrier on
-  // behalf of all threads in the same warp. Thus, it would not be safe for
-  // other threads in the main thread's warp to reach the first
-  // synchronize::threads call in genericStateMachine before the main thread
-  // reaches its corresponding synchronize::threads call: that would permit all
-  // active worker threads to proceed before the main thread has actually set
-  // state::ParallelRegionFn, and then they would immediately quit without
-  // doing any work.  mapping::getMaxTeamThreads() does not include any of the
-  // main thread's warp, so none of its threads can ever be active worker
-  // threads.
-  return mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD);
-#else
-  // On other architectures (e.g., Intel GPUs) all threads must enter the state
-  // machine to satisfy the requirements of workgroup of synchronize::threads
-  // call in genericStateMachine. Otherwise, the workers will wait on the
-  // call to synchronize::threads forever and never proceed.
-  (void)IsSPMD;
-  return true;
-#endif
-}
-
 } // namespace config
 } // namespace ompx
 
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index 40474f54b2d61..d6b8659436156 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -44,6 +44,31 @@ initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
   workshare::init(IsSPMD);
 }
 
+/// Architecture-specific: true if this thread should enter the state machine.
+static bool shouldEnterStateMachine(bool IsSPMD) {
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+  // This check is important for NVIDIA Pascal (but not Volta) and AMD GPUs.
+  // In those cases, a single thread can apparently satisfy a barrier on
+  // behalf of all threads in the same warp. Thus, it would not be safe for
+  // other threads in the main thread's warp to reach the first
+  // synchronize::threads call in genericStateMachine before the main thread
+  // reaches its corresponding synchronize::threads call: that would permit all
+  // active worker threads to proceed before the main thread has actually set
+  // state::ParallelRegionFn, and then they would immediately quit without
+  // doing any work. mapping::getMaxTeamThreads() does not include any of the
+  // main thread's warp, so none of its threads can ever be active worker
+  // threads.
+  return mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD);
+#else
+  // On other architectures (e.g., Intel GPUs) all threads must enter the state
+  // machine so that every thread in the workgroup takes part in the
+  // synchronize::threads calls in genericStateMachine. Otherwise, the workers
+  // would wait on synchronize::threads forever and never proceed.
+  (void)IsSPMD;
+  return true;
+#endif
+}
+
 /// Simple generic state machine for worker threads.
 static void genericStateMachine(IdentTy *Ident) {
   uint32_t TId = mapping::getThreadIdInBlock();
@@ -111,7 +136,7 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
   // be an active worker thread. The shouldEnterStateMachine check is
   // architecture-specific and handles platforms where warp-level barrier
   // forwarding could cause races during state machine initialization.
-  if (UseGenericStateMachine && config::shouldEnterStateMachine(IsSPMD))
+  if (UseGenericStateMachine && shouldEnterStateMachine(IsSPMD))
     genericStateMachine(KernelEnvironment.Ident);
 
   return mapping::getThreadIdInBlock();