[Openmp-commits] [openmp] [OpenMP] Fix work-stealing stack clobber with taskwait (PR #126049)
Julian Brown via Openmp-commits
openmp-commits at lists.llvm.org
Thu Feb 6 03:23:36 PST 2025
https://github.com/jtb20 created https://github.com/llvm/llvm-project/pull/126049
This patch series demonstrates and fixes a bug that causes crashes with OpenMP 'taskwait' directives in heavily multi-threaded scenarios.
The implementation of taskwait builds a graph of dependency nodes for tasks. Some of those dependency nodes (kmp_depnode_t) are allocated on the stack, and some on the heap. In the former case, the stack is specific to a given thread, and the task associated with the node is initially bound to the same thread. This works as long as there is a 1:1 mapping between tasks and the per-thread stack.
However, kmp_tasking.cpp:__kmp_execute_tasks_template implements a work-stealing algorithm that can take some task 'T1' from one thread's ready queue (say, thread 'A') and execute it on another thread (say, thread 'B').
If that happens, task T1 may have a dependency node on thread A's stack, and that will **not** be moved to thread B's stack when the work-stealing takes place.
Now, in a heavily multi-threaded program, **another** task, T2, can be invoked on thread 'A', re-using thread A's stack slot at the same time that T1 (now running on thread 'B') is still using it. This leads to random crashes, often (but not always) during dependency-node cleanup (__kmp_release_deps).
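To make the lifetime hazard concrete, here is a much-simplified, self-contained sketch of the pattern involved (the names DepNode, finish_successor and taskwait_with_deps are illustrative only and do not appear in the runtime): a dependency node lives in the waiting function's stack frame, its address is handed to dependent work that may end up executing on another thread, and a reference count is the only thing preventing the frame from being reused while that remote work still points at it.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct DepNode {
  std::atomic<int> nrefs{1}; // like kmp_base_depnode::nrefs: one reference owned by the creator
};

// A dependent task releasing its reference when it completes (loosely
// analogous to __kmp_release_deps / __kmp_node_deref).
void finish_successor(DepNode *pred) {
  pred->nrefs.fetch_sub(1, std::memory_order_acq_rel);
}

void taskwait_with_deps(std::vector<std::thread> &workers) {
  DepNode node;                                       // stack-allocated, like `kmp_depnode_t node = {0};`
  node.nrefs.fetch_add(1, std::memory_order_relaxed); // reference handed to the successor
  workers.emplace_back(finish_successor, &node);      // the successor may run on any thread

  // The frame holding `node` must not be reused until every other holder has
  // dropped its reference; this mirrors the nrefs wait loop in
  // __kmpc_omp_taskwait_deps_51. The crashes described above are what happens
  // when a node on one thread's stack is still referenced from another thread
  // while the owning stack slot gets reused.
  while (node.nrefs.load(std::memory_order_acquire) > 1)
    std::this_thread::yield();
}

int main() {
  std::vector<std::thread> workers;
  taskwait_with_deps(workers);
  for (std::thread &t : workers)
    t.join();
  std::printf("done\n");
  return 0;
}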
From 812b49251ed61d079ff47dfd5a65fd926d09e16a Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 5 Feb 2025 10:52:35 -0600
Subject: [PATCH 1/3] [OpenMP] Fix work-stealing stack clobber with taskwait
This patch series demonstrates and fixes a bug that causes crashes with
OpenMP 'taskwait' directives in heavily multi-threaded scenarios.
The implementation of taskwait builds a graph of dependency nodes for
tasks. Some of those dependency nodes (kmp_depnode_t) are allocated
on the stack, and some on the heap. In the former case, the stack is
specific to a given thread, and the task associated with the node is
initially bound to the same thread. This works as long as there is a
1:1 mapping between tasks and the per-thread stack.
However, kmp_tasking.cpp:__kmp_execute_tasks_template implements a
work-stealing algorithm that can take some task 'T1' from one thread's
ready queue (say, thread 'A') and execute it on another thread (say,
thread 'B').
If that happens, task T1 may have a dependency node on thread A's stack,
and that will *not* be moved to thread B's stack when the work-stealing
takes place.
Now, in a heavily multi-threaded program, *another* task, T2, can be
invoked on thread 'A', re-using thread A's stack slot at the same time
that T1 (now running on thread 'B') is still using it. This leads to
random crashes, often (but not always) during dependency-node cleanup
(__kmp_release_deps).
This first patch adds some instrumentation to make it more obvious when
the 1:1 mapping between tasks and thread stacks is violated. Typical
output will be (heavily truncated):
on-stack depnode moved from thread 0x5631d7d5c200 to thread 0x5631d7e15c00
Assertion failure at kmp_taskdeps.h(37): !node->dn.on_stack.
Adding debug output also affects the timing such that the bug shows up
much more frequently, at least on the system I'm working on.
---
openmp/runtime/src/kmp.h | 3 +++
openmp/runtime/src/kmp_taskdeps.cpp | 10 +++++++++-
openmp/runtime/src/kmp_taskdeps.h | 7 +++++++
3 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 04bf6c3b34dace2..efc07a4733d73b8 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2556,6 +2556,9 @@ typedef struct kmp_base_depnode {
#endif
std::atomic<kmp_int32> npredecessors;
std::atomic<kmp_int32> nrefs;
+#if KMP_DEBUG
+ void *on_stack;
+#endif
} kmp_base_depnode_t;
union KMP_ALIGN_CACHE kmp_depnode {
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index 39cf3496c5a1855..fa8eac3ec31a194 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -48,6 +48,9 @@ static void __kmp_init_node(kmp_depnode_t *node) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
__itt_sync_create(node, "OMP task dep node", NULL, 0);
#endif
+#if KMP_DEBUG
+ node->dn.on_stack = NULL;
+#endif
}
static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
@@ -1008,6 +1011,9 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
kmp_depnode_t node = {0};
__kmp_init_node(&node);
+#if KMP_DEBUG
+ node.dn.on_stack = thread;
+#endif
if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
@@ -1033,8 +1039,10 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
// Wait until the last __kmp_release_deps is finished before we free the
// current stack frame holding the "node" variable; once its nrefs count
// reaches 1, we're sure nobody else can try to reference it again.
- while (node.dn.nrefs > 1)
+ kmp_int32 nrefs;
+ while ((nrefs = node.dn.nrefs) > 1)
KMP_YIELD(TRUE);
+ KMP_DEBUG_ASSERT(nrefs == 1);
#if OMPT_SUPPORT
__ompt_taskwait_dep_finish(current_task, taskwait_task_data);
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index d2ab515158011a1..1d68eea45923bed 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -22,12 +22,19 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
if (!node)
return;
+#if KMP_DEBUG
+ if (node->dn.on_stack && node->dn.on_stack != thread)
+ fprintf (stderr, "on-stack depnode moved from thread %p to thread %p\n",
+ node->dn.on_stack, thread);
+#endif
+
kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
KMP_DEBUG_ASSERT(n >= 0);
if (n == 0) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
__itt_sync_destroy(node);
#endif
+ KMP_DEBUG_ASSERT(!node->dn.on_stack);
KMP_ASSERT(node->dn.nrefs == 0);
#if USE_FAST_MEMORY
__kmp_fast_free(thread, node);
From 3a89d1575c77db630c565f8c0c5e54e129f46e5b Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 5 Feb 2025 10:48:45 -0600
Subject: [PATCH 2/3] [OpenMP] Allocate depnodes on heap not stack in
__kmpc_omp_taskwait_deps_51
This patch contains the fix for the work-stealing stack clobber with
task graph dependency nodes. A straightforward-seeming fix is to just
use heap allocation instead of stack allocation for the problematic
nodes in question. This also allows us to revert the fix from PR85963:
https://github.com/llvm/llvm-project/issues/85963
There may be some concern that, despite fixing the issue, using heap
rather than stack allocation may have a negative performance impact.
However, this does not appear to be the case: in fact I measured a small
performance boost with this patch, i.e. timing the OpenMP_VV
test_taskwait_depend.c test over 100 runs (with the iteration count N
increased to 1024000), five times each before and after the patch is applied:
pre-patch
=========
real 0m30.635s
user 45m55.276s
sys 1m24.537s
real 0m30.943s
user 47m5.018s
sys 1m19.218s
real 0m30.573s
user 45m56.023s
sys 1m21.273s
real 0m30.955s
user 47m6.567s
sys 1m25.121s
real 0m30.712s
user 46m51.830s
sys 1m22.085s
with patch
==========
real 0m28.245s
user 39m16.057s
sys 1m11.156s
real 0m28.385s
user 39m35.553s
sys 1m16.413s
real 0m28.421s
user 39m42.498s
sys 1m16.783s
real 0m29.207s
user 39m5.006s
sys 1m10.247s
real 0m29.118s
user 38m30.484s
sys 1m14.027s
(On an AMD system with 64 cores/256 hw threads.)
This is probably because we no longer need the wait loop in
__kmpc_omp_taskwait_deps_51.
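As a rough illustration of why the wait loop becomes unnecessary, here is a minimal sketch (illustrative names only, using new/delete where the runtime uses __kmp_fast_allocate/__kmp_thread_malloc and __kmp_fast_free): once the node is heap-allocated, its lifetime is governed entirely by the reference count, so the taskwait path simply drops its own reference and returns, and whichever dereference brings the count to zero frees the node, on whatever thread that happens to be.

#include <atomic>
#include <cstdio>
#include <thread>

struct DepNode {
  std::atomic<int> nrefs{1}; // one reference owned by the creating thread
};

// Loosely analogous to __kmp_node_deref: drop one reference; whoever drops
// the last one frees the node.
void node_deref(DepNode *node) {
  if (!node)
    return;
  int n = node->nrefs.fetch_sub(1, std::memory_order_acq_rel) - 1;
  if (n == 0)
    delete node;
}

int main() {
  DepNode *node = new DepNode;                         // heap-allocated, as in the patched function
  node->nrefs.fetch_add(1, std::memory_order_relaxed); // reference held by a dependent task

  std::thread successor(node_deref, node); // the dependent task releases its
                                           // reference whenever it finishes
  node_deref(node); // the taskwait path drops its own reference and returns
                    // immediately; no spinning until nrefs falls back to 1
  successor.join();
  std::printf("done\n");
  return 0;
}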
---
openmp/runtime/src/kmp_taskdeps.cpp | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index fa8eac3ec31a194..d59ab077fe67bc3 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -1009,13 +1009,16 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
return;
}
- kmp_depnode_t node = {0};
- __kmp_init_node(&node);
-#if KMP_DEBUG
- node.dn.on_stack = thread;
+#if USE_FAST_MEMORY
+ kmp_depnode_t *node =
+ (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t));
+#else
+ kmp_depnode_t *node =
+ (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t));
#endif
+ __kmp_init_node(node);
- if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
+ if (!__kmp_check_deps(gtid, node, NULL, &current_task->td_dephash,
DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
noalias_dep_list)) {
KA_TRACE(10, ("__kmpc_omp_taskwait_deps(exit): T#%d has no blocking "
@@ -1029,20 +1032,14 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
int thread_finished = FALSE;
kmp_flag_32<false, false> flag(
- (std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
- while (node.dn.npredecessors > 0) {
+ (std::atomic<kmp_uint32> *)&node->dn.npredecessors, 0U);
+ while (node->dn.npredecessors > 0) {
flag.execute_tasks(thread, gtid, FALSE,
&thread_finished USE_ITT_BUILD_ARG(NULL),
__kmp_task_stealing_constraint);
}
- // Wait until the last __kmp_release_deps is finished before we free the
- // current stack frame holding the "node" variable; once its nrefs count
- // reaches 1, we're sure nobody else can try to reference it again.
- kmp_int32 nrefs;
- while ((nrefs = node.dn.nrefs) > 1)
- KMP_YIELD(TRUE);
- KMP_DEBUG_ASSERT(nrefs == 1);
+ __kmp_node_deref(thread, node);
#if OMPT_SUPPORT
__ompt_taskwait_dep_finish(current_task, taskwait_task_data);
From 5b0e5b72d2e21f11b4090655a2f5a38f1a2594d4 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 5 Feb 2025 14:07:21 -0600
Subject: [PATCH 3/3] [OpenMP] Clean up taskwait bug demonstration code
This patch removes the extra instrumentation that was added to
demonstrate the taskwait bug described in the previous patches. All
three patches can be committed together to reduce churn.
---
openmp/runtime/src/kmp.h | 3 ---
openmp/runtime/src/kmp_taskdeps.cpp | 3 ---
openmp/runtime/src/kmp_taskdeps.h | 7 -------
3 files changed, 13 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index efc07a4733d73b8..04bf6c3b34dace2 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2556,9 +2556,6 @@ typedef struct kmp_base_depnode {
#endif
std::atomic<kmp_int32> npredecessors;
std::atomic<kmp_int32> nrefs;
-#if KMP_DEBUG
- void *on_stack;
-#endif
} kmp_base_depnode_t;
union KMP_ALIGN_CACHE kmp_depnode {
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index d59ab077fe67bc3..90deb91afe7cbf0 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -48,9 +48,6 @@ static void __kmp_init_node(kmp_depnode_t *node) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
__itt_sync_create(node, "OMP task dep node", NULL, 0);
#endif
-#if KMP_DEBUG
- node->dn.on_stack = NULL;
-#endif
}
static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index 1d68eea45923bed..d2ab515158011a1 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -22,19 +22,12 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
if (!node)
return;
-#if KMP_DEBUG
- if (node->dn.on_stack && node->dn.on_stack != thread)
- fprintf (stderr, "on-stack depnode moved from thread %p to thread %p\n",
- node->dn.on_stack, thread);
-#endif
-
kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
KMP_DEBUG_ASSERT(n >= 0);
if (n == 0) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
__itt_sync_destroy(node);
#endif
- KMP_DEBUG_ASSERT(!node->dn.on_stack);
KMP_ASSERT(node->dn.nrefs == 0);
#if USE_FAST_MEMORY
__kmp_fast_free(thread, node);