[Openmp-commits] [openmp] [OpenMP] Fix work-stealing stack clobber with taskwait (PR #126049)

Julian Brown via Openmp-commits openmp-commits at lists.llvm.org
Thu Feb 6 03:23:36 PST 2025


https://github.com/jtb20 created https://github.com/llvm/llvm-project/pull/126049

This patch series demonstrates and fixes a bug that causes crashes with OpenMP 'taskwait' directives in heavily multi-threaded scenarios.

The implementation of taskwait builds a graph of dependency nodes for tasks. Some of those dependency nodes (kmp_depnode_t) are allocated on the stack, and some on the heap. In the former case, the stack is specific to a given thread, and the task associated with the node is initially bound to the same thread. This works as long as there is a 1:1 mapping between tasks and the per-thread stack.

However, kmp_tasking.cpp:__kmp_execute_tasks_template implements a work-stealing algorithm that can take some task 'T1' from some thread's ready queue (say, thread 'A'), and execute them on another thread (say, thread 'B').

If that happens, task T1 may have a dependency node on thread A's stack, and that will **not** be moved to thread B's stack when the work-stealing takes place.

Now, in a heavily multi-threaded program, **another** task, T2, can be invoked on thread 'A', re-using the stack slot for thread A at the same time that T1 is using the same slot from thread 'B'.  This leads to random crashes, often (but not always) during dependency-node cleanup (__kmp_release_deps).

>From 812b49251ed61d079ff47dfd5a65fd926d09e16a Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 5 Feb 2025 10:52:35 -0600
Subject: [PATCH 1/3] [OpenMP] Fix work-stealing stack clobber with taskwait

This patch series demonstrates and fixes a bug that causes crashes with
OpenMP 'taskwait' directives in heavily multi-threaded scenarios.

The implementation of taskwait builds a graph of dependency nodes for
tasks. Some of those dependency nodes (kmp_depnode_t) are allocated
on the stack, and some on the heap. In the former case, the stack is
specific to a given thread, and the task associated with the node is
initially bound to the same thread. This works as long as there is a
1:1 mapping between tasks and the per-thread stack.

However, kmp_tasking.cpp:__kmp_execute_tasks_template implements a
work-stealing algorithm that can take some task 'T1' from some thread's
ready queue (say, thread 'A'), and execute them on another thread (say,
thread 'B').

If that happens, task T1 may have a dependency node on thread A's stack,
and that will *not* be moved to thread B's stack when the work-stealing
takes place.

Now, in a heavily multi-threaded program, *another* task, T2, can be
invoked on thread 'A', re-using the stack slot for thread A at the same
time that T1 is using the same slot from thread 'B'.  This leads to
random crashes, often (but not always) during dependency-node cleanup
(__kmp_release_deps).

This first patch adds some instrumentation to make it more obvious when
the 1:1 mapping between tasks and thread stacks is violated. Typical
output will be (heavily truncated):

on-stack depnode moved from thread 0x5631d7d5c200 to thread 0x5631d7e15c00
Assertion failure at kmp_taskdeps.h(37): !node->dn.on_stack.

Adding debug output also affects the timing such that the bug shows up
much more frequently, at least on the system I'm working on.
---
 openmp/runtime/src/kmp.h            |  3 +++
 openmp/runtime/src/kmp_taskdeps.cpp | 10 +++++++++-
 openmp/runtime/src/kmp_taskdeps.h   |  7 +++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 04bf6c3b34dace2..efc07a4733d73b8 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2556,6 +2556,9 @@ typedef struct kmp_base_depnode {
 #endif
   std::atomic<kmp_int32> npredecessors;
   std::atomic<kmp_int32> nrefs;
+#if KMP_DEBUG
+  void *on_stack;
+#endif
 } kmp_base_depnode_t;
 
 union KMP_ALIGN_CACHE kmp_depnode {
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index 39cf3496c5a1855..fa8eac3ec31a194 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -48,6 +48,9 @@ static void __kmp_init_node(kmp_depnode_t *node) {
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
   __itt_sync_create(node, "OMP task dep node", NULL, 0);
 #endif
+#if KMP_DEBUG
+  node->dn.on_stack = NULL;
+#endif
 }
 
 static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
@@ -1008,6 +1011,9 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
 
   kmp_depnode_t node = {0};
   __kmp_init_node(&node);
+#if KMP_DEBUG
+  node.dn.on_stack = thread;
+#endif
 
   if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
                         DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
@@ -1033,8 +1039,10 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
   // Wait until the last __kmp_release_deps is finished before we free the
   // current stack frame holding the "node" variable; once its nrefs count
   // reaches 1, we're sure nobody else can try to reference it again.
-  while (node.dn.nrefs > 1)
+  kmp_int32 nrefs;
+  while ((nrefs = node.dn.nrefs) > 1)
     KMP_YIELD(TRUE);
+  KMP_DEBUG_ASSERT(nrefs == 1);
 
 #if OMPT_SUPPORT
   __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index d2ab515158011a1..1d68eea45923bed 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -22,12 +22,19 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
   if (!node)
     return;
 
+#if KMP_DEBUG
+  if (node->dn.on_stack && node->dn.on_stack != thread)
+    fprintf (stderr, "on-stack depnode moved from thread %p to thread %p\n",
+             node->dn.on_stack, thread);
+#endif
+
   kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
   KMP_DEBUG_ASSERT(n >= 0);
   if (n == 0) {
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
     __itt_sync_destroy(node);
 #endif
+    KMP_DEBUG_ASSERT(!node->dn.on_stack);
     KMP_ASSERT(node->dn.nrefs == 0);
 #if USE_FAST_MEMORY
     __kmp_fast_free(thread, node);

>From 3a89d1575c77db630c565f8c0c5e54e129f46e5b Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 5 Feb 2025 10:48:45 -0600
Subject: [PATCH 2/3] [OpenMP] Allocate depnodes on heap not stack in
 __kmpc_omp_taskwait_deps_51

This patch contains the fix for the work-stealing stack clobber with
task graph dependency nodes. A straightforward-seeming fix is to just
use heap allocation instead of stack allocation for the problematic
nodes in question.  This also allows us to revert the fix from PR85963:

  https://github.com/llvm/llvm-project/issues/85963

There may be some concern that despite fixing the issue, using heap rather
than stack allocation may have a negative performance impact. However
this does not appear to be the case: in fact I measured a small
performance boost with this patch, i.e. over 100 runs of the OpenMP_VV
test_taskwait_depend.c test with the iteration count N increased to
1024000, five times before & after the patch is applied:

pre-patch
=========

  real    0m30.635s
  user    45m55.276s
  sys     1m24.537s

  real    0m30.943s
  user    47m5.018s
  sys     1m19.218s

  real    0m30.573s
  user    45m56.023s
  sys     1m21.273s

  real    0m30.955s
  user    47m6.567s
  sys     1m25.121s

  real    0m30.712s
  user    46m51.830s
  sys     1m22.085s

with patch
==========

  real    0m28.245s
  user    39m16.057s
  sys     1m11.156s

  real    0m28.385s
  user    39m35.553s
  sys     1m16.413s

  real    0m28.421s
  user    39m42.498s
  sys     1m16.783s

  real    0m29.207s
  user    39m5.006s
  sys     1m10.247s

  real    0m29.118s
  user    38m30.484s
  sys     1m14.027s

(On an AMD system with 64 cores/256 hw threads.)

This is probably because we no longer need the wait loop in
__kmpc_omp_taskwait_deps_51.
---
 openmp/runtime/src/kmp_taskdeps.cpp | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index fa8eac3ec31a194..d59ab077fe67bc3 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -1009,13 +1009,16 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
     return;
   }
 
-  kmp_depnode_t node = {0};
-  __kmp_init_node(&node);
-#if KMP_DEBUG
-  node.dn.on_stack = thread;
+#if USE_FAST_MEMORY
+    kmp_depnode_t *node =
+        (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t));
+#else
+    kmp_depnode_t *node =
+        (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t));
 #endif
+  __kmp_init_node(node);
 
-  if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
+  if (!__kmp_check_deps(gtid, node, NULL, &current_task->td_dephash,
                         DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
                         noalias_dep_list)) {
     KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
@@ -1029,20 +1032,14 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
 
   int thread_finished = FALSE;
   kmp_flag_32<false, false> flag(
-      (std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
-  while (node.dn.npredecessors > 0) {
+      (std::atomic<kmp_uint32> *)&node->dn.npredecessors, 0U);
+  while (node->dn.npredecessors > 0) {
     flag.execute_tasks(thread, gtid, FALSE,
                        &thread_finished USE_ITT_BUILD_ARG(NULL),
                        __kmp_task_stealing_constraint);
   }
 
-  // Wait until the last __kmp_release_deps is finished before we free the
-  // current stack frame holding the "node" variable; once its nrefs count
-  // reaches 1, we're sure nobody else can try to reference it again.
-  kmp_int32 nrefs;
-  while ((nrefs = node.dn.nrefs) > 1)
-    KMP_YIELD(TRUE);
-  KMP_DEBUG_ASSERT(nrefs == 1);
+  __kmp_node_deref(thread, node);
 
 #if OMPT_SUPPORT
   __ompt_taskwait_dep_finish(current_task, taskwait_task_data);

>From 5b0e5b72d2e21f11b4090655a2f5a38f1a2594d4 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 5 Feb 2025 14:07:21 -0600
Subject: [PATCH 3/3] [OpenMP] Clean up taskwait bug demonstration code

This patch removes the extra instrumentation to demonstrate the taskwait
bug described in previous patches.  All three patches can be committed
together to reduce churn.
---
 openmp/runtime/src/kmp.h            | 3 ---
 openmp/runtime/src/kmp_taskdeps.cpp | 3 ---
 openmp/runtime/src/kmp_taskdeps.h   | 7 -------
 3 files changed, 13 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index efc07a4733d73b8..04bf6c3b34dace2 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2556,9 +2556,6 @@ typedef struct kmp_base_depnode {
 #endif
   std::atomic<kmp_int32> npredecessors;
   std::atomic<kmp_int32> nrefs;
-#if KMP_DEBUG
-  void *on_stack;
-#endif
 } kmp_base_depnode_t;
 
 union KMP_ALIGN_CACHE kmp_depnode {
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index d59ab077fe67bc3..90deb91afe7cbf0 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -48,9 +48,6 @@ static void __kmp_init_node(kmp_depnode_t *node) {
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
   __itt_sync_create(node, "OMP task dep node", NULL, 0);
 #endif
-#if KMP_DEBUG
-  node->dn.on_stack = NULL;
-#endif
 }
 
 static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index 1d68eea45923bed..d2ab515158011a1 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -22,19 +22,12 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
   if (!node)
     return;
 
-#if KMP_DEBUG
-  if (node->dn.on_stack && node->dn.on_stack != thread)
-    fprintf (stderr, "on-stack depnode moved from thread %p to thread %p\n",
-             node->dn.on_stack, thread);
-#endif
-
   kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
   KMP_DEBUG_ASSERT(n >= 0);
   if (n == 0) {
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
     __itt_sync_destroy(node);
 #endif
-    KMP_DEBUG_ASSERT(!node->dn.on_stack);
     KMP_ASSERT(node->dn.nrefs == 0);
 #if USE_FAST_MEMORY
     __kmp_fast_free(thread, node);



More information about the Openmp-commits mailing list